

#ifndef FLAME_H
#define FLAME_H

// Allow C++ users to include this header file in their source code. However,
// we make the extern "C" conditional on whether we're using a C++ compiler,
// since regular C compilers don't understand the extern "C" construct.
#ifdef __cplusplus
extern "C" {
#endif 

  // Include autoconf-related preprocessor defines.
// begin FLA_config.h







// Fortran-77 symbol name mangling: both wrappers paste a single trailing
// underscore onto the lowercase spelling and ignore the uppercase argument.
#define F77_FUNC(name,NAME) name ## _

#define F77_FUNC_(name,NAME) name ## _

// --- libflame feature switches recorded at configure time -------------------

#define FLA_ENABLE_BLIS1_USE_OF_FLA_MALLOC 1

#define FLA_ENABLE_INTERNAL_ERROR_CHECKING 1

#define FLA_ENABLE_LAPACK2FLAME 1

#define FLA_ENABLE_NON_CRITICAL_CODE 1

#define FLA_ENABLE_PORTABLE_TIMER 1

// 2 is the value this file assigns to FLA_FULL_ERROR_CHECKING.
#define FLA_INTERNAL_ERROR_CHECKING_LEVEL 2

// 0 matches neither FLA_OPENMP (1) nor FLA_PTHREADS (2), so none of the
// threading-specific sections of this file are compiled.
#define FLA_MULTITHREADING_MODEL 0

#define FLA_PORTABLE_TIMER_IS_CLOCK_GETTIME 1

// 0 is the value this file assigns to FLA_NO_INTRINSICS.
#define FLA_VECTOR_INTRINSIC_TYPE 0

// --- Results of the configure-time header and library probes ----------------

#define HAVE_ASSERT_H 1

#define HAVE_FCNTL_H 1

#define HAVE_INTTYPES_H 1

#define HAVE_LIBM 1

#define HAVE_MATH_H 1

#define HAVE_MEMORY_H 1

#define HAVE_SIGNAL_H 1

#define HAVE_STDINT_H 1

#define HAVE_STDLIB_H 1

#define HAVE_STRINGS_H 1

#define HAVE_STRING_H 1

#define HAVE_SYS_STAT_H 1

#define HAVE_SYS_TIME_H 1

#define HAVE_SYS_TYPES_H 1

#define HAVE_UNISTD_H 1

// --- Package identification strings (all empty in this configuration) -------

#define PACKAGE_BUGREPORT ""

#define PACKAGE_NAME ""

#define PACKAGE_STRING ""

#define PACKAGE_TARNAME ""

#define PACKAGE_URL ""

#define PACKAGE_VERSION ""

// --- Miscellaneous probe results --------------------------------------------

#define PROTOTYPES 1

#define STDC_HEADERS 1

// Consulted by the time-header selection in this file: when non-zero, both
// <sys/time.h> and <time.h> are included.
#define TIME_WITH_SYS_TIME 1

// Define _GNU_SOURCE only when the including code (or the compiler command
// line) has not already done so. An unconditional definition here clashes
// with an earlier one that has a different replacement list.
#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1
#endif

#define __PROTOTYPES 1





#ifndef __cplusplus

#endif



// end FLA_config.h
// begin FLA_config_check.h


#ifdef FLA_ENABLE_WINDOWS_BUILD
#include <time.h> // skipped
#else
  // Handle the results of checking for time.h and sys/time.h
  #if TIME_WITH_SYS_TIME
#include <sys/time.h> // skipped
#include <time.h> // skipped
  #else
    #if HAVE_SYS_TIME_H
#include <sys/time.h> // skipped
    #else
#include <time.h> // skipped
    #endif
  #endif
#endif

// Handle the results of checking for ia64intrin.h. The contents of this header
// are required by the ia64 sections of FLA_Clock.c.
#ifdef HAVE_IA64INTRIN_H
#include <ia64intrin.h> // skipped
#endif

// end FLA_config_check.h

  // Include standard C header files.
#include <stdio.h> // skipped
#include <stdlib.h> // skipped
#include <stdarg.h> // skipped
#include <string.h> // skipped
  #ifdef FLA_ENABLE_WINDOWS_BUILD
#include <windows.h> // skipped
  #else
  #ifndef FLA_ENABLE_TIDSP
    // TI CG does not support POSIX
#include <unistd.h> // skipped
#include <fcntl.h> // skipped
#include <sys/types.h> // skipped
  #endif
  #endif
#include <math.h> // skipped
#include <float.h> // skipped
#include <signal.h> // skipped

  // Include prototypes for BLAS-like interfaces.
  #ifndef BLIS1_FROM_LIBFLAME
    #define BLIS1_FROM_LIBFLAME
  #endif
// begin blis1.h


#ifndef BLIS1_H
#define BLIS1_H

// Allow C++ users to include this header file in their source code. However,
// we make the extern "C" conditional on whether we're using a C++ compiler,
// since regular C compilers don't understand the extern "C" construct.
#ifdef __cplusplus
extern "C" {
#endif

#include <stdio.h> // skipped
#include <stdlib.h> // skipped
#include <math.h> // skipped

// Determine whether or not we are using BLIS from libflame.
//#define BLIS1_FROM_LIBFLAME

#ifdef BLIS1_FROM_LIBFLAME

  // If using libflame, pull in its header files so that
  // vector intrinsics-related macro constants are set properly.
  //#include "FLAME.h"
// begin FLA_config.h







// Configuration constants as pulled in through blis1.h. Each definition below
// repeats the value given by the configuration section at the top of this
// file, so the redefinitions are benign.

// Fortran-77 symbol name mangling: both wrappers paste a single trailing
// underscore onto the lowercase spelling and ignore the uppercase argument.
#define F77_FUNC(name,NAME) name ## _

#define F77_FUNC_(name,NAME) name ## _

// --- libflame feature switches recorded at configure time -------------------

#define FLA_ENABLE_BLIS1_USE_OF_FLA_MALLOC 1

#define FLA_ENABLE_INTERNAL_ERROR_CHECKING 1

#define FLA_ENABLE_LAPACK2FLAME 1

#define FLA_ENABLE_NON_CRITICAL_CODE 1

#define FLA_ENABLE_PORTABLE_TIMER 1

// 2 is the value this file assigns to FLA_FULL_ERROR_CHECKING.
#define FLA_INTERNAL_ERROR_CHECKING_LEVEL 2

// 0 matches neither FLA_OPENMP (1) nor FLA_PTHREADS (2), so none of the
// threading-specific sections of this file are compiled.
#define FLA_MULTITHREADING_MODEL 0

#define FLA_PORTABLE_TIMER_IS_CLOCK_GETTIME 1

// 0 is the value this file assigns to FLA_NO_INTRINSICS.
#define FLA_VECTOR_INTRINSIC_TYPE 0

// --- Results of the configure-time header and library probes ----------------

#define HAVE_ASSERT_H 1

#define HAVE_FCNTL_H 1

#define HAVE_INTTYPES_H 1

#define HAVE_LIBM 1

#define HAVE_MATH_H 1

#define HAVE_MEMORY_H 1

#define HAVE_SIGNAL_H 1

#define HAVE_STDINT_H 1

#define HAVE_STDLIB_H 1

#define HAVE_STRINGS_H 1

#define HAVE_STRING_H 1

#define HAVE_SYS_STAT_H 1

#define HAVE_SYS_TIME_H 1

#define HAVE_SYS_TYPES_H 1

#define HAVE_UNISTD_H 1

// --- Package identification strings (all empty in this configuration) -------

#define PACKAGE_BUGREPORT ""

#define PACKAGE_NAME ""

#define PACKAGE_STRING ""

#define PACKAGE_TARNAME ""

#define PACKAGE_URL ""

#define PACKAGE_VERSION ""

// --- Miscellaneous probe results --------------------------------------------

#define PROTOTYPES 1

#define STDC_HEADERS 1

// Consulted by the time-header selection in this file: when non-zero, both
// <sys/time.h> and <time.h> are included.
#define TIME_WITH_SYS_TIME 1

// Define _GNU_SOURCE only when it has not been defined already. An
// unconditional definition here clashes with an earlier one that has a
// different replacement list.
#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1
#endif

#define __PROTOTYPES 1





#ifndef __cplusplus

#endif



// end FLA_config.h
// begin FLA_macro_defs.h




// --- Miscellaneous macro definitions -----------------------------------------

// NOTE(review): NULL is forcibly redefined as the plain integer constant 0,
// discarding whatever definition an earlier system header supplied. Code that
// includes this header therefore sees an integer-typed NULL.
#undef  NULL
#define NULL 0

// On Windows builds the 'restrict' keyword is mapped to the __restrict
// spelling.
#ifdef FLA_ENABLE_WINDOWS_BUILD
  #define restrict  __restrict
#endif


// --- Type-related macro definitions ------------------------------------------

// Integer constants for the FLA_* parameter types. Each group below is named
// after the typedef it belongs to; the groups occupy disjoint numeric ranges
// (100s, 200s, 300s, ...), with the exceptions noted on individual groups.

// FLA_Bool
// Any earlier TRUE/FALSE definitions are discarded first.
#undef  TRUE
#undef  FALSE
#define TRUE  1
#define FALSE 0

// FLA_Error (non-specific)
// Note that success is -1, not 0.
#define FLA_SUCCESS           (-1)
#define FLA_FAILURE           (-2)

// FLA_Quadrant
// The two digits read as (row, column): T/B -> 1/2, L/R -> 1/2.
#define FLA_TL                 11
#define FLA_TR                 12
#define FLA_BL                 21
#define FLA_BR                 22

// FLA_Datatype
#define FLA_FLOAT             100
#define FLA_DOUBLE            101
#define FLA_COMPLEX           102
#define FLA_DOUBLE_COMPLEX    103
#define FLA_INT               104
#define FLA_CONSTANT          105

// FLA_Elemtype
#define FLA_MATRIX            150
#define FLA_SCALAR            151

// FLA_Side
// NOTE(review): the *_MASK constants in this section presumably reduce a
// parameter value to a small array index via a bitwise AND -- confirm at the
// use sites.
#define FLA_TOP               200
#define FLA_BOTTOM            201
#define FLA_LEFT              210
#define FLA_RIGHT             211
#define FLA_SIDE_MASK         0x1

// FLA_Uplo
#define FLA_LOWER_TRIANGULAR  300
#define FLA_UPPER_TRIANGULAR  301
#define FLA_ZERO_MATRIX       310
#define FLA_FULL_MATRIX       311
#define FLA_UPLO_MASK         0x1

// FLA_Trans
#define FLA_NO_TRANSPOSE      400
#define FLA_TRANSPOSE         401
#define FLA_CONJ_TRANSPOSE    402
#define FLA_CONJ_NO_TRANSPOSE 403
#define FLA_TRANS_MASK        0x3

// FLA_Conj
#define FLA_NO_CONJUGATE      450
#define FLA_CONJUGATE         451

// FLA_Diag
#define FLA_UNIT_DIAG         500
#define FLA_NONUNIT_DIAG      501
#define FLA_ZERO_DIAG         502
#define FLA_DIAG_MASK         0x3

// FLA_Dimension
#define FLA_DIMENSION_M       600
#define FLA_DIMENSION_K       601
#define FLA_DIMENSION_N       602
#define FLA_DIMENSION_MIN     603

// FLA_Dimension_index
// Zero-based indices listed in the same order as the FLA_DIMENSION_* values.
#define FLA_DIM_M_INDEX         0
#define FLA_DIM_K_INDEX         1
#define FLA_DIM_N_INDEX         2
#define FLA_DIM_MIN_INDEX       3
#define FLA_DIM_INDEX_MASK    0x3

// FLA_Pivot_type
#define FLA_NATIVE_PIVOTS     700
#define FLA_LAPACK_PIVOTS     701

// FLA_Direct
#define FLA_FORWARD           800
#define FLA_BACKWARD          801

// FLA_Store
#define FLA_COLUMNWISE        900
#define FLA_ROWWISE           901

// FLA_Matrix_type
#define FLA_FLAT             1000
#define FLA_HIER             1001

// FLA_Precision
#define FLA_SINGLE_PRECISION 1100
#define FLA_DOUBLE_PRECISION 1101

// FLA_Domain
#define FLA_REAL_DOMAIN      1200
#define FLA_COMPLEX_DOMAIN   1201

// FLA_Inv
#define FLA_NO_INVERSE       1300
#define FLA_INVERSE          1301

// FLA_Evd_type
#define FLA_EVD_WITHOUT_VECTORS         1400
#define FLA_EVD_WITH_VECTORS            1401
#define FLA_EVD_OF_TRIDIAG_WITH_VECTORS 1402

// FLA_Svd_type
#define FLA_SVD_VECTORS_ALL           1500
#define FLA_SVD_VECTORS_MIN_COPY      1501
#define FLA_SVD_VECTORS_MIN_OVERWRITE 1502
#define FLA_SVD_VECTORS_NONE          1503

// FLA_Machval
// FLA_MACH_START aliases the first value (FLA_MACH_EPS); FLA_MACH_N_VALS is
// the number of values in the contiguous range 1600..1610.
#define FLA_MACH_START                1600
#define FLA_MACH_EPS                  1600
#define FLA_MACH_SFMIN                1601
#define FLA_MACH_BASE                 1602
#define FLA_MACH_PREC                 1603
#define FLA_MACH_NDIGMANT             1604
#define FLA_MACH_RND                  1605
#define FLA_MACH_EMIN                 1606
#define FLA_MACH_RMIN                 1607
#define FLA_MACH_EMAX                 1608
#define FLA_MACH_RMAX                 1609
#define FLA_MACH_EPS2                 1610
#define FLA_MACH_N_VALS                 11

// FLA_Diag_off
// Signed offsets relative to the main diagonal (0).
#define FLA_SUPER_DIAGONAL     ( 1)
#define FLA_MAIN_DIAGONAL        0
#define FLA_SUB_DIAGONAL       (-1)

// FLAME threading model
// Compared against FLA_MULTITHREADING_MODEL in this file's #if directives.
#define FLA_OPENMP              1
#define FLA_PTHREADS            2

// FLAME vector intrinsics types
// Compared against FLA_VECTOR_INTRINSIC_TYPE in this file's #if directives.
#define FLA_NO_INTRINSICS       0
#define FLA_SSE_INTRINSICS      3

// FLAME internal error checking level
#define FLA_FULL_ERROR_CHECKING 2
#define FLA_MIN_ERROR_CHECKING  1
#define FLA_NO_ERROR_CHECKING   0

// FLA_Datatype_index
// Zero-based indices for the four floating-point datatypes (s, d, c, z).
#define FLA_S_INDEX             0
#define FLA_D_INDEX             1
#define FLA_C_INDEX             2
#define FLA_Z_INDEX             3
#define FLA_DTYPE_INDEX_MASK  0x3

// Default blocksizes, used if none are available. Each value can be
// overridden by defining the macro before this header is included.
#ifndef FLA_DEFAULT_M_BLOCKSIZE
  #define FLA_DEFAULT_M_BLOCKSIZE  128
#endif
#ifndef FLA_DEFAULT_K_BLOCKSIZE
  #define FLA_DEFAULT_K_BLOCKSIZE  128
#endif
#ifndef FLA_DEFAULT_N_BLOCKSIZE
  #define FLA_DEFAULT_N_BLOCKSIZE  128
#endif

// QR and LQ factorizations typically have an inner blocksize that corresponds
// to the length of the S (or T) block Householder matrix. For consistency, we
// define the ratio of the inner blocksize to the outer blocksize here, as it
// is used in several places. Note that other operations have analogous inner
// blocksizes, which we also define in terms of the outer storage blocksize,
// or in some cases such as Hessenberg, tridiagonal, and bidiagonal reductions,
// in terms of the system-wide default blocksize.
// Every ratio below is the floating-point constant 0.25. The spelling
// "UDDATE" is part of the macro's existing name and is kept as is.
#define FLA_QR_INNER_TO_OUTER_B_RATIO      (0.25)
#define FLA_LQ_INNER_TO_OUTER_B_RATIO      (0.25)
#define FLA_LU_INNER_TO_OUTER_B_RATIO      (0.25)
#define FLA_UDDATE_INNER_TO_OUTER_B_RATIO  (0.25)
#define FLA_HESS_INNER_TO_OUTER_B_RATIO    (0.25)
#define FLA_TRIDIAG_INNER_TO_OUTER_B_RATIO (0.25)
#define FLA_BIDIAG_INNER_TO_OUTER_B_RATIO  (0.25)
#define FLA_CAQR_INNER_TO_OUTER_B_RATIO    (0.25)



// --- Error-related macro definitions -----------------------------------------

// Useful when determining the relative index base of the error codes.
// Equal to the first (least negative) specific error code below.
#define FLA_ERROR_CODE_MIN                    (-10)

// FLA_Error values.
// The codes descend from -10 to -111. The sequence is not contiguous: -31,
// -34, -36, -62, -66 and -81 are not assigned in this list.
#define FLA_INVALID_SIDE                      (-10)
#define FLA_INVALID_UPLO                      (-11)
#define FLA_INVALID_TRANS                     (-12)
#define FLA_INVALID_TRANS_GIVEN_DATATYPE      (-13)
#define FLA_INVALID_CONJ                      (-14)
#define FLA_INVALID_DIRECT                    (-15)
#define FLA_INVALID_STOREV                    (-16)
#define FLA_INVALID_DATATYPE                  (-17)
#define FLA_INVALID_INTEGER_DATATYPE          (-18)
#define FLA_INVALID_REAL_DATATYPE             (-19)
#define FLA_INVALID_COMPLEX_DATATYPE          (-20)
#define FLA_OBJECT_NOT_INTEGER                (-21)
#define FLA_OBJECT_NOT_REAL                   (-22)
#define FLA_OBJECT_NOT_COMPLEX                (-23)
#define FLA_OBJECT_NOT_SQUARE                 (-24)
#define FLA_OBJECT_NOT_SCALAR                 (-25)
#define FLA_OBJECT_NOT_VECTOR                 (-26)
#define FLA_INCONSISTENT_DATATYPES            (-27)
#define FLA_NONCONFORMAL_DIMENSIONS           (-28)
#define FLA_UNEQUAL_VECTOR_DIMS               (-29)
#define FLA_INVALID_HESSENBERG_INDICES        (-30)
#define FLA_NULL_POINTER                      (-32)
#define FLA_SPECIFIED_OBJ_DIM_MISMATCH        (-33)
#define FLA_INVALID_PIVOT_TYPE                (-35)
#define FLA_MALLOC_RETURNED_NULL_POINTER      (-37)
#define FLA_OBJECT_BASE_BUFFER_MISMATCH       (-38)
#define FLA_OBJECTS_NOT_VERTICALLY_ADJ        (-39)
#define FLA_OBJECTS_NOT_HORIZONTALLY_ADJ      (-40)
#define FLA_ADJACENT_OBJECT_DIM_MISMATCH      (-41)
#define FLA_OBJECTS_NOT_VERTICALLY_ALIGNED    (-42)
#define FLA_OBJECTS_NOT_HORIZONTALLY_ALIGNED  (-43)
#define FLA_INVALID_FLOATING_DATATYPE         (-44)
#define FLA_OBJECT_NOT_FLOATING_POINT         (-45)
#define FLA_INVALID_BLOCKSIZE_VALUE           (-46)
#define FLA_OPEN_RETURNED_ERROR               (-47)
#define FLA_LSEEK_RETURNED_ERROR              (-48)
#define FLA_CLOSE_RETURNED_ERROR              (-49)
#define FLA_UNLINK_RETURNED_ERROR             (-50)
#define FLA_READ_RETURNED_ERROR               (-51)
#define FLA_WRITE_RETURNED_ERROR              (-52)
#define FLA_INVALID_QUADRANT                  (-53)
#define FLA_NOT_YET_IMPLEMENTED               (-54)
#define FLA_EXPECTED_NONNEGATIVE_VALUE        (-55)
#define FLA_SUPERMATRIX_NOT_ENABLED           (-56)
#define FLA_UNDEFINED_ERROR_CODE              (-57)
#define FLA_INVALID_DIAG                      (-58)
#define FLA_INCONSISTENT_OBJECT_PRECISION     (-59)
#define FLA_INVALID_BLOCKSIZE_OBJ             (-60)
#define FLA_VECTOR_DIM_BELOW_MIN              (-61)
#define FLA_PTHREAD_CREATE_RETURNED_ERROR     (-63)
#define FLA_PTHREAD_JOIN_RETURNED_ERROR       (-64)
#define FLA_INVALID_ISGN_VALUE                (-65)
#define FLA_CHOL_FAILED_MATRIX_NOT_SPD        (-67)
#define FLA_INVALID_ELEMTYPE                  (-68)
#define FLA_POSIX_MEMALIGN_FAILED             (-69)
#define FLA_INVALID_SUBMATRIX_DIMS            (-70)
#define FLA_INVALID_SUBMATRIX_OFFSET          (-71)
#define FLA_OBJECT_NOT_SCALAR_ELEMTYPE        (-72)
#define FLA_OBJECT_NOT_MATRIX_ELEMTYPE        (-73)
#define FLA_ENCOUNTERED_NON_POSITIVE_NTHREADS (-74)
#define FLA_INVALID_CONJ_GIVEN_DATATYPE       (-75)
#define FLA_INVALID_COMPLEX_TRANS             (-76)
#define FLA_INVALID_REAL_TRANS                (-77)
#define FLA_INVALID_BLAS_TRANS                (-78)
#define FLA_INVALID_NONCONSTANT_DATATYPE      (-79)
#define FLA_OBJECT_NOT_NONCONSTANT            (-80)
#define FLA_OBJECT_DATATYPES_NOT_EQUAL        (-82)
#define FLA_DIVIDE_BY_ZERO                    (-83)
#define FLA_OBJECT_ELEMTYPES_NOT_EQUAL        (-84)
#define FLA_INVALID_PIVOT_INDEX_RANGE         (-85)
#define FLA_HOUSEH_PANEL_MATRIX_TOO_SMALL     (-86)
#define FLA_INVALID_OBJECT_LENGTH             (-87)
#define FLA_INVALID_OBJECT_WIDTH              (-88)
#define FLA_INVALID_ERROR_CHECKING_LEVEL      (-89)
#define FLA_ATTEMPTED_OVER_REPART_2X2         (-90)
#define FLA_ATTEMPTED_OVER_REPART_2X1         (-91)
#define FLA_ATTEMPTED_OVER_REPART_1X2         (-92)
#define FLA_EXTERNAL_LAPACK_NOT_IMPLEMENTED   (-93)
#define FLA_INVALID_ROW_STRIDE                (-94)
#define FLA_INVALID_COL_STRIDE                (-95)
#define FLA_INVALID_STRIDE_COMBINATION        (-96)
#define FLA_INVALID_VECTOR_DIM                (-97)
#define FLA_EXPECTED_ROW_VECTOR               (-98)
#define FLA_EXPECTED_COL_VECTOR               (-99)
#define FLA_INVALID_INVERSE                   (-100)
#define FLA_MALLOC_GPU_RETURNED_NULL_POINTER  (-101)
#define FLA_INVALID_EVD_TYPE                  (-102)
#define FLA_INVALID_SVD_TYPE                  (-103)
#define FLA_INVALID_MACHVAL                   (-104)
#define FLA_INVALID_DIAG_OFFSET               (-105)
#define FLA_EXPECTED_COL_STORAGE              (-106)
#define FLA_EXPECTED_ROW_STORAGE              (-107)
#define FLA_LAPAC2FLAME_INVALID_RETURN        (-108)
#define FLA_INVALID_SVD_TYPE_COMBINATION      (-109)
#define FLA_INVALID_SVD_TYPE_AND_TRANS_COMBINATION (-110)
#define FLA_OBJECT_NOT_COMPARABLE             (-111)

// Necessary when computing whether an error code is defined.
// Equal to the last (most negative) error code above; keep it in step when a
// new code is appended.
#define FLA_ERROR_CODE_MAX                    (-111)

// Internal string matrix limits.
#define FLA_MAX_NUM_ERROR_MSGS                 150
#define FLA_MAX_ERROR_MSG_LENGTH               200

// Error code translation and output macro definition.
// Forwards 'code' to FLA_Check_error_code_helper() together with the file
// name and line number of the call site.
#define FLA_Check_error_code( code ) \
        FLA_Check_error_code_helper( code, __FILE__, __LINE__ )



// --- Common functions implemented as macros ----------------------------------

// Generic helper macros. Any earlier definition of each name is discarded
// first. Arguments may be evaluated more than once, so callers should pass
// expressions without side effects.

// Yields x when x compares less than y, otherwise y.
#undef min
#define min( x, y ) \
        ( (x) < (y) ? (x) : (y) )

// Yields x when x compares greater than y, otherwise y.
#undef max
#define max( x, y ) \
        ( (x) > (y) ? (x) : (y) )

// Yields a when b is non-negative, otherwise the negation of a.
#undef signof
#define signof( a, b ) \
        ( (b) >= 0 ? (a) : -(a) )

// Swaps a and b through the caller-supplied scratch variable temp.
#undef exchange
#define exchange( a, b, temp ) \
        { temp = a; \
          a = b; \
          b = temp; }

// --- Other macro definitions -------------------------------------------------

// Yields FLA_MINUS_ONE when the object 'a' refers to the same base as
// FLA_ONE, and FLA_ONE for any other object. The argument is parenthesized so
// that expressions such as *ptr select the member of the intended object.
#define FLA_NEGATE( a ) \
        ( (a).base == FLA_ONE.base ? FLA_MINUS_ONE : FLA_ONE )


// end FLA_macro_defs.h
// begin FLA_type_defs.h


#ifndef FLA_TYPE_DEFS_H
#define FLA_TYPE_DEFS_H

#if   FLA_MULTITHREADING_MODEL == FLA_OPENMP
#ifdef FLA_ENABLE_TIDSP
#include <ti/omp/omp.h> // skipped
#else
#include <omp.h> // skipped
#endif
#elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS
#include <pthread.h> // skipped
#endif


// --- Complex type definitions -----------------------------------------------

// Single-precision complex value: real part followed by imaginary part. The
// guard macro keeps the type from being declared twice.
#ifndef _DEFINED_SCOMPLEX
#define _DEFINED_SCOMPLEX
typedef struct scomplex
{
  float real;
  float imag;
} scomplex;
#endif

// Double-precision complex value with the same (real, imag) layout.
#ifndef _DEFINED_DCOMPLEX
#define _DEFINED_DCOMPLEX
typedef struct dcomplex
{
  double real;
  double imag;
} dcomplex;
#endif


// --- Parameter and return type definitions ----------------------------------

// Every parameter and return type is a plain int alias, so the compiler does
// not distinguish one from another; the names only document which group of
// FLA_* constants a value is expected to come from.
typedef int FLA_Bool;
typedef int FLA_Error;
typedef int FLA_Quadrant;
typedef int FLA_Datatype;
typedef int FLA_Elemtype;
typedef int FLA_Side;
typedef int FLA_Uplo;
typedef int FLA_Trans;
typedef int FLA_Conj;
typedef int FLA_Diag;
typedef int FLA_Dimension;
typedef int FLA_Pivot_type;
typedef int FLA_Direct;
typedef int FLA_Store;
typedef int FLA_Matrix_type;
typedef int FLA_Precision;
typedef int FLA_Domain;
typedef int FLA_Inv;
typedef int FLA_Evd_type;
typedef int FLA_Svd_type;
typedef int FLA_Machval;
typedef int FLA_Diag_off;

// Unsigned type used for the dimension, stride and offset fields of the
// object structs in this file. The guard macro keeps it from being declared
// twice.
#ifndef _DEFINED_DIM_T
#define _DEFINED_DIM_T
typedef unsigned long dim_t;
#endif

// --- Intrinsic/assembly definitions ----------------------------------------

// Compiled only when SSE intrinsics are selected. With
// FLA_VECTOR_INTRINSIC_TYPE set to 0, as in this file's configuration, the
// whole section is skipped.
#if FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS

#include "pmmintrin.h" // skipped

//typedef double v2df __attribute__ ((vector_size (16)));

// Overlays one __m128 vector with an array of four floats.
typedef union
{
    __m128  v; 
    float   f[4];
} v4sf_t;

// Overlays one __m128d vector with an array of two doubles.
typedef union
{
    __m128d v; 
    double  d[2];
} v2df_t;

#endif

// --- FLAME object definitions -----------------------------------------------

// Lock wrappers whose single member depends on the threading model.
typedef struct FLA_Lock_s     FLA_Lock;
typedef struct FLA_RWLock_s   FLA_RWLock;

// NOTE(review): when FLA_MULTITHREADING_MODEL is neither FLA_OPENMP nor
// FLA_PTHREADS (it is 0 in this file's configuration), both structs below are
// left without any member. A member-less struct is a compiler extension, not
// ISO C -- confirm that every supported compiler accepts it.
//#ifdef FLA_ENABLE_MULTITHREADING
struct FLA_Lock_s
{
  // Implementation-specific lock object
#if   FLA_MULTITHREADING_MODEL == FLA_OPENMP
  omp_lock_t       lock;
#elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS
  pthread_mutex_t  lock;
#endif
};
struct FLA_RWLock_s
{
  // Implementation-specific lock object
  // (the OpenMP variant uses the same omp_lock_t type as FLA_Lock_s)
#if   FLA_MULTITHREADING_MODEL == FLA_OPENMP
  omp_lock_t       lock;
#elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS
  pthread_rwlock_t lock;
#endif
};
//#endif

// Forward declarations for the SuperMatrix types. Only FLASH_Thread is
// declared unconditionally; the others exist only when
// FLA_ENABLE_SUPERMATRIX is defined.
#ifdef FLA_ENABLE_SUPERMATRIX
typedef int                   FLASH_Verbose;
typedef int                   FLASH_Data_aff;

typedef struct FLASH_Queue_s  FLASH_Queue;
typedef struct FLASH_Task_s   FLASH_Task;
typedef struct FLASH_Dep_s    FLASH_Dep;
#endif
typedef struct FLASH_Thread_s FLASH_Thread;

// Base object: the record that FLA_Obj views point to through their 'base'
// member. The struct layout depends on whether FLA_ENABLE_SUPERMATRIX is
// defined.
typedef struct FLA_Obj_struct
{
  // Basic object description fields
  FLA_Datatype  datatype;
  FLA_Elemtype  elemtype;
  // NOTE(review): m/n are presumably the row/column dimensions and rs/cs the
  // row/column strides -- confirm against the object-creation code.
  dim_t         m;
  dim_t         n;
  dim_t         rs;
  dim_t         cs;
  dim_t         m_inner;
  dim_t         n_inner;
  unsigned long id;
  dim_t         m_index;
  dim_t         n_index;

  // Element count and untyped pointer for the object's storage.
  // NOTE(review): ownership and lifetime of 'buffer' are not visible here.
  dim_t         n_elem_alloc;
  void*         buffer;
  int           buffer_info;

  FLA_Uplo      uplo;

#ifdef FLA_ENABLE_SUPERMATRIX
  // Fields for supermatrix
  int           n_read_blocks;
  int           n_write_blocks;

  // All the tasks that previously read this block, anti-dependency
  int           n_read_tasks;
  FLASH_Dep*    read_task_head;
  FLASH_Dep*    read_task_tail;

  // Task that last overwrote this block, flow dependency
  FLASH_Task*   write_task;
#endif
} FLA_Base_obj;

// Object view: offsets and dimensions plus a pointer to the FLA_Base_obj they
// refer to.
typedef struct FLA_Obj_view
{
  // Basic object view description fields
  // NOTE(review): offm/offn are presumably the row/column offsets of the view
  // within the base object, and m/n the view's own dimensions -- confirm.
  dim_t         offm;
  dim_t         offn;
  dim_t         m;
  dim_t         n;
  dim_t         m_inner;
  dim_t         n_inner;

  // Base object this view refers to.
  FLA_Base_obj* base;

} FLA_Obj;

// SuperMatrix queue, task and dependency records. Compiled only when
// FLA_ENABLE_SUPERMATRIX is defined.
#ifdef FLA_ENABLE_SUPERMATRIX
// Queue of tasks, tracked by a count plus head and tail pointers.
struct FLASH_Queue_s
{
  // Number of tasks currently in queue
  unsigned int  n_tasks;

  // Pointers to head (front) and tail (back) of queue
  FLASH_Task*   head;
  FLASH_Task*   tail;
};

// One task: labels, an untyped function pointer and control-tree pointer,
// argument arrays with their counts, dependency list, and list links.
struct FLASH_Task_s
{
  // Execution information
  int           n_ready;

  // Labels
  int           order;
  int           queue;
  int           height;
  int           thread;
  int           cache;
  FLA_Bool      hit;

  // Function pointer
  void*         func;

  // Control tree pointer
  void*         cntl;

  // Name of task
  char*         name;

  // GPU enabled task
  FLA_Bool      enabled_gpu;

  // HIP enabled task
  FLA_Bool      enabled_hip;

  // Integer arguments
  int           n_int_args;
  int*          int_arg;

  // Constant FLA_Obj arguments
  int           n_fla_args;
  FLA_Obj*      fla_arg;

  // Input FLA_Obj arguments
  int           n_input_args;
  FLA_Obj*      input_arg;

  // Output FLA_Obj argument
  int           n_output_args;
  FLA_Obj*      output_arg;

  // Number of blocks within all macroblocks
  int           n_macro_args;

  // Number of write after read dependencies
  int           n_war_args;

  // Dependence information
  int           n_dep_args;
  FLASH_Dep*    dep_arg_head;
  FLASH_Dep*    dep_arg_tail;

  // Support for a doubly linked list of tasks
  FLASH_Task*   prev_task;
  FLASH_Task*   next_task;

  // Support for a doubly linked list for wait queue
  FLASH_Task*   prev_wait;
  FLASH_Task*   next_wait;
};

// Node of a singly linked list of task dependencies.
struct FLASH_Dep_s
{
  // Task yielding dependency
  FLASH_Task*   task;

  // Support for linked list of FLASH_Deps
  FLASH_Dep*    next_dep;
};
#endif // FLA_ENABLE_SUPERMATRIX

// Per-thread record. Declared unconditionally; only the pthread_obj member
// depends on the threading model.
struct FLASH_Thread_s
{
  // The thread's unique identifier
  int       id;

  // Pointer to variables needed to execute SuperMatrix mechanism
  void*     args;

#if FLA_MULTITHREADING_MODEL == FLA_PTHREADS
  // The thread object. Only needed for the POSIX threads implementation.
  pthread_t pthread_obj;
#endif
};

#endif // FLA_TYPE_DEFS_H
// end FLA_type_defs.h

  // --- Pass-through macros for BLIS ---
  // Each BLIS1_ENABLE_* switch is defined exactly when its FLA_ENABLE_*
  // counterpart is defined.
  #ifdef FLA_ENABLE_CBLAS_INTERFACES
    #define BLIS1_ENABLE_CBLAS_INTERFACES
  #endif
  #ifdef FLA_ENABLE_WINDOWS_BUILD
    #define BLIS1_ENABLE_WINDOWS_BUILD
  #endif
  #ifdef FLA_ENABLE_UPPERCASE_F77
    #define BLIS1_ENABLE_UPPERCASE_F77
  #endif
  #ifdef FLA_ENABLE_VECTOR_INTRINSICS
    #define BLIS1_ENABLE_VECTOR_INTRINSICS
  #endif

  // The intrinsic type is forwarded by name, so it expands to whatever
  // FLA_VECTOR_INTRINSIC_TYPE is defined as at the point of use.
  #define BLIS1_VECTOR_INTRINSIC_TYPE FLA_VECTOR_INTRINSIC_TYPE

#else

  // --- BLIS configuration options ---

  // #define BLIS1_ENABLE_USE_OF_FLA_MALLOC
  // #define BLIS1_ENABLE_CBLAS_INTERFACES
  // #define BLIS1_ENABLE_WINDOWS_BUILD
  // #define BLIS1_ENABLE_UPPERCASE_F77
  // #define BLIS1_ENABLE_VECTOR_INTRINSICS
  //   #define BLIS1_VECTOR_INTRINSIC_TYPE BLIS1_NO_INTRINSICS
  //   #define BLIS1_VECTOR_INTRINSIC_TYPE BLIS1_SSE_INTRINSICS

#endif

// begin blis_macro_defs.h


#ifndef BLIS1_MACRO_DEFS_H
#define BLIS1_MACRO_DEFS_H

// --- Constants ---------------------------------------------------------------

// Vector intrinsic selectors; the values equal those of FLA_NO_INTRINSICS and
// FLA_SSE_INTRINSICS defined earlier in this file.
#define BLIS1_NO_INTRINSICS  0
#define BLIS1_SSE_INTRINSICS 3

// --- boolean ---
// Any earlier FALSE/TRUE definitions are replaced.

#undef FALSE
#define FALSE 0

#undef TRUE
#define TRUE 1



// --- Functional macros -------------------------------------------------------

// --- Type-agnostic ---

// Generic selection and magnitude helpers. They work on any arithmetic
// operands; every argument may be evaluated more than once.

// min, max, abs (abs compares against the integer constant 0)

#define bl1_min( a, b ) \
    ( (a) < (b) ? (a) : (b) )
#define bl1_max( a, b ) \
    ( (a) > (b) ? (a) : (b) )
#define bl1_abs( a ) \
    ( (a) <= 0 ? -(a) : (a) )

// fmin, fmax, fabs (fabs compares against the floating-point constant 0.0)

#define bl1_fmin( a, b ) \
    ( (a) < (b) ? (a) : (b) )
#define bl1_fmax( a, b ) \
    ( (a) > (b) ? (a) : (b) )
#define bl1_fabs( a ) \
    ( (a) <= 0.0 ? -(a) : (a) )

// fminabs, fmaxabs: the smaller / larger of the two magnitudes

#define bl1_fminabs( a, b ) \
    bl1_min( bl1_fabs( a ), bl1_fabs( b ) )

#define bl1_fmaxabs( a, b ) \
    bl1_max( bl1_fabs( a ), bl1_fabs( b ) )

// --- Type-dependent ---

// --- neg1 ---

// In-place negation: *x = -(*x). Each macro supplies its own terminating
// semicolon.

// void bl1_sneg1( float* x );
#define bl1_sneg1( x ) \
*(x)     *= -1.0F;

// void bl1_dneg1( double* x );
#define bl1_dneg1( x ) \
*(x)     *= -1.0;

// void bl1_cneg1( scomplex* x );
// Both component updates sit inside one compound statement, so the macro
// behaves as a single statement after an unbraced if/for/while.
#define bl1_cneg1( x ) \
{ \
(x)->real *= -1.0F; \
(x)->imag *= -1.0F; \
}

// void bl1_zneg1( dcomplex* x );
// Braced for the same reason as bl1_cneg1.
#define bl1_zneg1( x ) \
{ \
(x)->real *= -1.0; \
(x)->imag *= -1.0; \
}

// --- neg2 ---

// Out-of-place negation: *y = -(*x). Each macro supplies its own terminating
// semicolon.

// void bl1_sneg2( float* x, float* y );
#define bl1_sneg2( x, y ) \
*(y)      = -1.0F * *(x);

// void bl1_dneg2( double* x, double* y );
#define bl1_dneg2( x, y ) \
*(y)      = -1.0  * *(x);

// void bl1_cneg2( scomplex* x, scomplex* y );
// Both component assignments sit inside one compound statement, so the macro
// behaves as a single statement after an unbraced if/for/while.
#define bl1_cneg2( x, y ) \
{ \
(y)->real = -1.0F * (x)->real; \
(y)->imag = -1.0F * (x)->imag; \
}

// void bl1_zneg2( dcomplex* x, dcomplex* y );
// Braced for the same reason as bl1_cneg2.
#define bl1_zneg2( x, y ) \
{ \
(y)->real = -1.0  * (x)->real; \
(y)->imag = -1.0  * (x)->imag; \
}

// --- sqrte ---

// Checked square root. When the value (for the complex variants, the real
// part) is greater than zero and not NaN, it is replaced by its square root
// and *error is set to FLA_SUCCESS; the complex variants also zero the
// imaginary part. Otherwise the value is left untouched and *error is set to
// FLA_FAILURE. Zero counts as a failure.

// void bl1_ssqrte( float* alpha, int* error );
#define bl1_ssqrte( alpha, error ) \
if ( !( *(alpha) <= 0.0F ) && !isnan( *(alpha) ) ) \
{ \
	*(alpha) = ( float ) sqrt( *(alpha) ); \
	*(error) = FLA_SUCCESS; \
} \
else \
{ \
	*(error) = FLA_FAILURE; \
}

// void bl1_dsqrte( double* alpha, int* error );
#define bl1_dsqrte( alpha, error ) \
if ( !( *(alpha) <= 0.0 ) && !isnan( *(alpha) ) ) \
{ \
	*(alpha) = ( double ) sqrt( *(alpha) ); \
	*(error) = FLA_SUCCESS; \
} \
else \
{ \
	*(error) = FLA_FAILURE; \
}

// void bl1_csqrte( scomplex* alpha, int* error );
#define bl1_csqrte( alpha, error ) \
if ( !( (alpha)->real <= 0.0F ) && !isnan( (alpha)->real) ) \
{ \
	(alpha)->real = ( float ) sqrt( (alpha)->real ); \
	(alpha)->imag = 0.0F; \
	*(error) = FLA_SUCCESS; \
} \
else \
{ \
	*(error) = FLA_FAILURE; \
}

// void bl1_zsqrte( dcomplex* alpha, int* error );
#define bl1_zsqrte( alpha, error ) \
if ( !( (alpha)->real <= 0.0 ) && !isnan( (alpha)->real) ) \
{ \
	(alpha)->real = ( double ) sqrt( (alpha)->real ); \
	(alpha)->imag = 0.0; \
	*(error) = FLA_SUCCESS; \
} \
else \
{ \
	*(error) = FLA_FAILURE; \
}

// --- absval2 ---

// Absolute value / complex modulus, written to a separate output operand.

// void bl1_sabsval2( float* alpha, float* absval );
#define bl1_sabsval2( alpha, absval ) \
*(absval) = ( float ) fabs( ( double ) *(alpha) );

// void bl1_dabsval2( double* alpha, double* absval );
#define bl1_dabsval2( alpha, absval ) \
*(absval) = fabs( *(alpha) );

// The complex variants compute the modulus as
//   sqrt( s ) * sqrt( (re/s)*re + (im/s)*im ),  s = max( |re|, |im| ).
// When both components are zero, s is zero and the divisions would produce
// NaN, so that case is short-circuited to a modulus of exactly zero.

// void bl1_cabsval2( scomplex* x, scomplex* a );
#define bl1_cabsval2( x, a ) \
{ \
	float  s   = bl1_fmaxabs( (x)->real, (x)->imag ); \
	float  mag = 0.0F; \
	if ( s != 0.0F ) \
		mag = sqrtf( s ) * \
		      sqrtf( ( (x)->real / s ) * (x)->real + \
		             ( (x)->imag / s ) * (x)->imag ); \
	(a)->real   = mag; \
	(a)->imag   = 0.0F; \
}

// void bl1_csabsval2( scomplex* x, float* a );
#define bl1_csabsval2( x, a ) \
{ \
	float  s   = bl1_fmaxabs( (x)->real, (x)->imag ); \
	float  mag = 0.0F; \
	if ( s != 0.0F ) \
		mag = sqrtf( s ) * \
		      sqrtf( ( (x)->real / s ) * (x)->real + \
		             ( (x)->imag / s ) * (x)->imag ); \
	*(a)       = mag; \
}

// void bl1_zabsval2( dcomplex* x, dcomplex* a );
#define bl1_zabsval2( x, a ) \
{ \
	double s   = bl1_fmaxabs( (x)->real, (x)->imag ); \
	double mag = 0.0; \
	if ( s != 0.0 ) \
		mag = sqrt( s ) * \
		      sqrt( ( (x)->real / s ) * (x)->real + \
		            ( (x)->imag / s ) * (x)->imag ); \
	(a)->real   = mag; \
	(a)->imag   = 0.0; \
}

// void bl1_zdabsval2( dcomplex* x, double* a );
#define bl1_zdabsval2( x, a ) \
{ \
	double s   = bl1_fmaxabs( (x)->real, (x)->imag ); \
	double mag = 0.0; \
	if ( s != 0.0 ) \
		mag = sqrt( s ) * \
		      sqrt( ( (x)->real / s ) * (x)->real + \
		            ( (x)->imag / s ) * (x)->imag ); \
	*(a)       = mag; \
}


// --- absqr ---

// In-place squared magnitude. Each macro supplies its own terminating
// semicolon.

// void bl1_sabsqr( float* alpha );
#define bl1_sabsqr( alpha ) \
*(alpha) = *(alpha) * *(alpha);

// void bl1_dabsqr( double* alpha );
#define bl1_dabsqr( alpha ) \
*(alpha) = *(alpha) * *(alpha);

// void bl1_cabsqr( scomplex* alpha );
// real <- real^2 + imag^2, imag <- 0. Both statements sit inside one compound
// statement, so the macro behaves as a single statement after an unbraced
// if/for/while.
#define bl1_cabsqr( alpha ) \
{ \
(alpha)->real = (alpha)->real * (alpha)->real + (alpha)->imag * (alpha)->imag; \
(alpha)->imag = 0.0F; \
}

// void bl1_zabsqr( dcomplex* alpha );
// Braced for the same reason as bl1_cabsqr.
#define bl1_zabsqr( alpha ) \
{ \
(alpha)->real = (alpha)->real * (alpha)->real + (alpha)->imag * (alpha)->imag; \
(alpha)->imag = 0.0; \
}

// --- invscals ---

// --- invscals: y <- y / a, in place ---

// void bl1_sinvscals( float* a, float* y );
#define bl1_sinvscals( a, y ) \
*(y) = *(y) / *(a);

// void bl1_dinvscals( double* a, double* y );
#define bl1_dinvscals( a, y ) \
*(y) = *(y) / *(a);

// void bl1_csinvscals( float* a, scomplex* y );
// Complex value divided by a real scalar: each component is divided by *a.
#define bl1_csinvscals( a, y ) \
{ \
(y)->real = (y)->real / *(a); \
(y)->imag = (y)->imag / *(a); \
}

// void bl1_cinvscals( scomplex* a, scomplex* y );
// Complex division y <- y / a. The divisor is first scaled by
// s = max( |a.real|, |a.imag| ). The original y->real is saved in yrt because
// it is still needed after y->real has been overwritten.
// NOTE(review): nothing here guards against a == 0, in which case s is zero.
#define bl1_cinvscals( a, y ) \
{ \
	float  s     = bl1_fmaxabs( (a)->real, (a)->imag ); \
	float  ar_s  = (a)->real / s; \
	float  ai_s  = (a)->imag / s; \
	float  yrt   = (y)->real; \
	float  temp  = ( ar_s * (a)->real + ai_s * (a)->imag ); \
	(y)->real    = ( (yrt)     * ar_s + (y)->imag * ai_s ) / temp; \
	(y)->imag    = ( (y)->imag * ar_s - (yrt)     * ai_s ) / temp; \
}

// void bl1_zdinvscals( double* a, dcomplex* y );
// Complex value divided by a real scalar: each component is divided by *a.
#define bl1_zdinvscals( a, y ) \
{ \
(y)->real = (y)->real / *(a); \
(y)->imag = (y)->imag / *(a); \
}

// void bl1_zinvscals( dcomplex* a, dcomplex* y );
// Double-precision counterpart of bl1_cinvscals.
#define bl1_zinvscals( a, y ) \
{ \
	double s     = bl1_fmaxabs( (a)->real, (a)->imag ); \
	double ar_s  = (a)->real / s; \
	double ai_s  = (a)->imag / s; \
	double yrt   = (y)->real; \
	double temp  = ( ar_s * (a)->real + ai_s * (a)->imag ); \
	(y)->real    = ( (yrt)     * ar_s + (y)->imag * ai_s ) / temp; \
	(y)->imag    = ( (y)->imag * ar_s - (yrt)     * ai_s ) / temp; \
}

// --- div3: a <- x / y ---

// void bl1_sdiv3( float* x, float* y, float* a );
#define bl1_sdiv3( x, y, a ) \
*(a) = *(x) / *(y);

// void bl1_ddiv3( double* x, double* y, double* a );
#define bl1_ddiv3( x, y, a ) \
*(a) = *(x) / *(y);

// void bl1_cdiv3( scomplex* x, scomplex* y, scomplex* a );
// a = x / y;
// x is copied into a, then a is divided in place by y. The pointer arguments
// are parenthesized before being dereferenced so that expressions such as
// (buf + i) are handled correctly.
#define bl1_cdiv3( x, y, a ) \
{ \
	*(a) = *(x); \
	bl1_cinvscals( y, a ); \
}

// void bl1_zdiv3( dcomplex* x, dcomplex* y, dcomplex* a );
// Double-precision counterpart of bl1_cdiv3.
#define bl1_zdiv3( x, y, a ) \
{ \
	*(a) = *(x); \
	bl1_zinvscals( y, a ); \
}

// --- add3 ---

// Three-operand addition: a <- x + y.

// void bl1_sadd3( float* x, float* y, float* a );
#define bl1_sadd3( x, y, a ) *(a) = *(x) + *(y);

// void bl1_dadd3( double* x, double* y, double* a );
#define bl1_dadd3( x, y, a ) *(a) = *(x) + *(y);

// void bl1_cadd3( scomplex* x, scomplex* y, scomplex* a );
// The two components are summed independently of each other.
#define bl1_cadd3( x, y, a ) \
{ \
	(a)->imag = (x)->imag + (y)->imag; \
	(a)->real = (x)->real + (y)->real; \
}

// void bl1_zadd3( dcomplex* x, dcomplex* y, dcomplex* a );
#define bl1_zadd3( x, y, a ) \
{ \
	(a)->imag = (x)->imag + (y)->imag; \
	(a)->real = (x)->real + (y)->real; \
}

// --- copys ---

// Scalar copy with optional conjugation: y <- x, or y <- conj( x ).

// void bl1_scopys( conj1_t conj, float* x, float* y );
// The conj argument is ignored for real values.
#define bl1_scopys( conj, x, y ) \
*(y) = *(x);

// void bl1_dcopys( conj1_t conj, double* x, double* y );
// The conj argument is ignored for real values.
#define bl1_dcopys( conj, x, y ) \
*(y) = *(x);

// void bl1_ccopys( conj1_t conj, scomplex* x, scomplex* y );
// Copies x to y, then flips the sign of y's imaginary part when
// bl1_is_conj( conj ) holds. The copy and the conditional sit inside one
// compound statement, so an unbraced if/for/while in the caller governs both
// and a following 'else' cannot bind to the macro's internal 'if'.
#define bl1_ccopys( conj, x, y ) \
{ \
*(y) = *(x); \
if ( bl1_is_conj( conj ) ) (y)->imag *= -1.0F; \
}

// void bl1_zcopys( conj1_t conj, dcomplex* x, dcomplex* y );
// Braced for the same reason as bl1_ccopys.
#define bl1_zcopys( conj, x, y ) \
{ \
*(y) = *(x); \
if ( bl1_is_conj( conj ) ) (y)->imag *= -1.0; \
}

// --- scals ---

// In-place scaling: y <- a * y.

// void bl1_sscals( float* a, float* y );
#define bl1_sscals( a, y ) *(y) = *(a) * *(y);

// void bl1_dscals( double* a, double* y );
#define bl1_dscals( a, y ) *(y) = *(a) * *(y);

// void bl1_csscals( float* a, scomplex* y );
// Real scalar times complex value: each component is scaled by *a.
#define bl1_csscals( a, y ) \
{ \
	(y)->imag = *(a) * (y)->imag; \
	(y)->real = *(a) * (y)->real; \
}

// void bl1_cscals( scomplex* a, scomplex* y );
// Complex product. Both result components are formed in temporaries before y
// is written, because each one reads both components of y.
#define bl1_cscals( a, y ) \
{ \
	float tempi = (a)->imag * (y)->real + (a)->real * (y)->imag; \
	float tempr = (a)->real * (y)->real - (a)->imag * (y)->imag; \
	(y)->imag = tempi; \
	(y)->real = tempr; \
}

// void bl1_zdscals( double* a, dcomplex* y );
// Real scalar times complex value: each component is scaled by *a.
#define bl1_zdscals( a, y ) \
{ \
	(y)->imag = *(a) * (y)->imag; \
	(y)->real = *(a) * (y)->real; \
}

// void bl1_zscals( dcomplex* a, dcomplex* y );
// Double-precision counterpart of bl1_cscals.
#define bl1_zscals( a, y ) \
{ \
	double tempi = (a)->imag * (y)->real + (a)->real * (y)->imag; \
	double tempr = (a)->real * (y)->real - (a)->imag * (y)->imag; \
	(y)->imag = tempi; \
	(y)->real = tempr; \
}

// --- mult3 ---

// void bl1_smult3( float* x, float* y, float* a );
// Stores the product of the floats at x and y into the float at a.
// The expansion already ends with a semicolon.
#define bl1_smult3( x, y, a ) (a)[0] = (x)[0] * (y)[0];

// void bl1_dmult3( double* x, double* y, double* a );
// Stores the product of the doubles at x and y into the double at a.
// The expansion already ends with a semicolon.
#define bl1_dmult3( x, y, a ) (a)[0] = (x)[0] * (y)[0];

// void bl1_cmult3( scomplex* x, scomplex* y, scomplex* a );
// Complex multiply: a = x * y. Both result components are computed before a
// is written, so a may alias x or y. The temporaries use bl1_-prefixed names
// so that caller identifiers such as tempr or tempi are not captured by the
// expansion.
#define bl1_cmult3( x, y, a ) \
{ \
float bl1_cmult3_re_ = (x)->real * (y)->real - (x)->imag * (y)->imag; \
float bl1_cmult3_im_ = (x)->imag * (y)->real + (x)->real * (y)->imag; \
(a)->real = bl1_cmult3_re_; \
(a)->imag = bl1_cmult3_im_; \
}

// void bl1_zmult3( dcomplex* x, dcomplex* y, dcomplex* a );
// Complex multiply: a = x * y. Both result components are computed before a
// is written, so a may alias x or y. The temporaries use bl1_-prefixed names
// so that caller identifiers such as tempr or tempi are not captured by the
// expansion.
#define bl1_zmult3( x, y, a ) \
{ \
double bl1_zmult3_re_ = (x)->real * (y)->real - (x)->imag * (y)->imag; \
double bl1_zmult3_im_ = (x)->imag * (y)->real + (x)->real * (y)->imag; \
(a)->real = bl1_zmult3_re_; \
(a)->imag = bl1_zmult3_im_; \
}

// --- mult4 ---

// void bl1_smult4( float* alpha, float* x, float* y1, float* y2 );
// Computes y2 = y1 + alpha * x on the pointed-to floats.
// The expansion already ends with a semicolon.
#define bl1_smult4( alpha, x, y1, y2 ) (y2)[0] = (y1)[0] + (alpha)[0] * (x)[0];

// void bl1_dmult4( double* alpha, double* x, double* y1, double* y2 );
// Computes y2 = y1 + alpha * x on the pointed-to doubles.
// The expansion already ends with a semicolon.
#define bl1_dmult4( alpha, x, y1, y2 ) (y2)[0] = (y1)[0] + (alpha)[0] * (x)[0];

// void bl1_cmult4( scomplex* alpha, scomplex* x, scomplex* y1, scomplex* y2 );
// Complex multiply-add: y2 = y1 + alpha * x. Both result components are
// computed into temporaries before y2 is written, so y2 may alias alpha, x
// or y1 (as bl1_cmult3 already allows for its output).
#define bl1_cmult4( alpha, x, y1, y2 ) \
{ \
float bl1_cmult4_re_ = (y1)->real + (alpha)->real * (x)->real - (alpha)->imag * (x)->imag; \
float bl1_cmult4_im_ = (y1)->imag + (alpha)->imag * (x)->real + (alpha)->real * (x)->imag; \
(y2)->real = bl1_cmult4_re_; \
(y2)->imag = bl1_cmult4_im_; \
}

// void bl1_zmult4( dcomplex* alpha, dcomplex* x, dcomplex* y1, dcomplex* y2 );
// Complex multiply-add: y2 = y1 + alpha * x. Both result components are
// computed into temporaries before y2 is written, so y2 may alias alpha, x
// or y1 (as bl1_zmult3 already allows for its output).
#define bl1_zmult4( alpha, x, y1, y2 ) \
{ \
double bl1_zmult4_re_ = (y1)->real + (alpha)->real * (x)->real - (alpha)->imag * (x)->imag; \
double bl1_zmult4_im_ = (y1)->imag + (alpha)->imag * (x)->real + (alpha)->real * (x)->imag; \
(y2)->real = bl1_zmult4_re_; \
(y2)->imag = bl1_zmult4_im_; \
}

// --- conjs ---

// void bl1_sconjs( float* a );
// No-op for real values: expands to an empty statement and never
// evaluates a.
#define bl1_sconjs( a ) ;

// void bl1_dconjs( double* a );
// No-op for real values: expands to an empty statement and never
// evaluates a.
#define bl1_dconjs( a ) ;

// void bl1_cconjs( scomplex* a );
// Conjugates the complex value at a in place by multiplying its imaginary
// part by -1. The expansion already ends with a semicolon.
#define bl1_cconjs( a ) (a)[0].imag *= -1.0F;

// void bl1_zconjs( dcomplex* a );
// Conjugates the complex value at a in place by multiplying its imaginary
// part by -1. The expansion already ends with a semicolon.
#define bl1_zconjs( a ) (a)[0].imag *= -1.0;

// --- copyconj ---

// void bl1_scopyconj( float* x, float* y );
// Plain copy of the float at x into y (nothing is negated for real values).
// The expansion already ends with a semicolon.
#define bl1_scopyconj( x, y ) (y)[0] = (x)[0];

// void bl1_dcopyconj( double* x, double* y );
// Plain copy of the double at x into y (nothing is negated for real values).
// The expansion already ends with a semicolon.
#define bl1_dcopyconj( x, y ) (y)[0] = (x)[0];

// void bl1_ccopyconj( scomplex* x, scomplex* y );
// Writes the complex conjugate of *x into *y: the real part is copied and
// the imaginary part is multiplied by -1. The body is braced so that both
// assignments stay together when the macro is the body of an unbraced
// if/for/while.
#define bl1_ccopyconj( x, y ) \
{ \
(y)->real =         (x)->real; \
(y)->imag = -1.0F * (x)->imag; \
}

// void bl1_zcopyconj( dcomplex* x, dcomplex* y );
// Writes the complex conjugate of *x into *y: the real part is copied and
// the imaginary part is multiplied by -1. The body is braced so that both
// assignments stay together when the macro is the body of an unbraced
// if/for/while.
#define bl1_zcopyconj( x, y ) \
{ \
(y)->real =         (x)->real; \
(y)->imag = -1.0  * (x)->imag; \
}

// --- eq1 ---

// int bl1_seq1( float* alpha );
// Expression macro: nonzero when the float at alpha compares equal to 1.0F.
// alpha is parenthesized so pointer expressions such as p + 1 dereference
// the intended element.
#define bl1_seq1( alpha ) \
  ( *(alpha) == 1.0F )

// int bl1_deq1( double* alpha );
// Expression macro: nonzero when the double at alpha compares equal to 1.0.
// alpha is parenthesized so pointer expressions such as p + 1 dereference
// the intended element.
#define bl1_deq1( alpha ) \
  ( *(alpha) == 1.0 )

// int bl1_ceq1( scomplex* alpha );
// Expression macro: nonzero when the complex value at alpha is exactly
// 1 + 0i (real == 1.0F and imag == 0.0F).
#define bl1_ceq1( alpha ) \
  ( (alpha)[0].real == 1.0F && (alpha)[0].imag == 0.0F )

// int bl1_zeq1( dcomplex* alpha );
// Expression macro: nonzero when the complex value at alpha is exactly
// 1 + 0i (real == 1.0 and imag == 0.0).
#define bl1_zeq1( alpha ) \
  ( (alpha)[0].real == 1.0 && (alpha)[0].imag == 0.0 )

// --- Swapping/toggle macros --------------------------------------------------

// --- swap_pointers ---

// Exchanges the two float* lvalues a and b. The temporary uses a
// bl1_-prefixed name so that a caller variable named temp can be passed
// without being captured by the expansion.
#define bl1_sswap_pointers( a, b ) \
{ \
float* bl1_sswap_tmp_ = (a); \
(a) = (b); \
(b) = bl1_sswap_tmp_; \
}

// Exchanges the two double* lvalues a and b. The temporary uses a
// bl1_-prefixed name so that a caller variable named temp can be passed
// without being captured by the expansion.
#define bl1_dswap_pointers( a, b ) \
{ \
double* bl1_dswap_tmp_ = (a); \
(a) = (b); \
(b) = bl1_dswap_tmp_; \
}

// Exchanges the two pointer lvalues a and b through a void* temporary.
// The temporary uses a bl1_-prefixed name so that a caller variable named
// temp can be passed without being captured by the expansion.
#define bl1_cswap_pointers( a, b ) \
{ \
void* bl1_cswap_tmp_ = (a); \
(a) = (b); \
(b) = bl1_cswap_tmp_; \
}

// Exchanges the two pointer lvalues a and b through a void* temporary.
// The temporary uses a bl1_-prefixed name so that a caller variable named
// temp can be passed without being captured by the expansion.
#define bl1_zswap_pointers( a, b ) \
{ \
void* bl1_zswap_tmp_ = (a); \
(a) = (b); \
(b) = bl1_zswap_tmp_; \
}

// --- swap_ints ---

// Exchanges the two int lvalues a and b. The temporary uses a bl1_-prefixed
// name so that a caller variable named temp can be passed without being
// captured by the expansion.
#define bl1_swap_ints( a, b ) \
{ \
int bl1_swap_ints_tmp_ = (a); \
(a) = (b); \
(b) = bl1_swap_ints_tmp_; \
}

// --- swap_trans ---

// Exchanges the two trans1_t lvalues a and b. The temporary uses a
// bl1_-prefixed name so that a caller variable named temp can be passed
// without being captured by the expansion.
#define bl1_swap_trans( a, b ) \
{ \
trans1_t bl1_swap_trans_tmp_ = (a); \
(a) = (b); \
(b) = bl1_swap_trans_tmp_; \
}

// --- swap_conj ---

// Exchanges the two conj1_t lvalues a and b. The temporary uses a
// bl1_-prefixed name so that a caller variable named temp can be passed
// without being captured by the expansion.
#define bl1_swap_conj( a, b ) \
{ \
conj1_t bl1_swap_conj_tmp_ = (a); \
(a) = (b); \
(b) = bl1_swap_conj_tmp_; \
}

// --- toggle_side ---

// Flips the lvalue side: BLIS1_RIGHT when bl1_is_left( side ) holds,
// BLIS1_LEFT otherwise.
#define bl1_toggle_side( side ) \
{ \
	(side) = ( bl1_is_left( side ) ? BLIS1_RIGHT : BLIS1_LEFT ); \
}

// --- toggle_uplo ---

// Flips the lvalue uplo: BLIS1_UPPER_TRIANGULAR when bl1_is_lower( uplo )
// holds, BLIS1_LOWER_TRIANGULAR otherwise.
#define bl1_toggle_uplo( uplo ) \
{ \
	(uplo) = ( bl1_is_lower( uplo ) ? BLIS1_UPPER_TRIANGULAR \
	                                : BLIS1_LOWER_TRIANGULAR ); \
}

// --- toggle_trans ---
// Toggles the transposition of the lvalue trans while keeping conjugation:
// no-transpose <-> transpose, conj-no-transpose <-> conj-transpose. The
// predicates are tested in that order; a value matching none of the first
// three becomes BLIS1_CONJ_NO_TRANSPOSE.
#define bl1_toggle_trans( trans ) \
{ \
	(trans) = ( bl1_is_notrans( trans )     ? BLIS1_TRANSPOSE \
	          : bl1_is_trans( trans )       ? BLIS1_NO_TRANSPOSE \
	          : bl1_is_conjnotrans( trans ) ? BLIS1_CONJ_TRANSPOSE \
	          :                               BLIS1_CONJ_NO_TRANSPOSE ); \
}

// --- toggle_conjtrans ---
// Sets the lvalue trans to BLIS1_CONJ_TRANSPOSE when bl1_is_notrans( trans )
// holds; every other value becomes BLIS1_NO_TRANSPOSE.
#define bl1_toggle_conjtrans( trans ) \
{ \
	(trans) = ( bl1_is_notrans( trans ) ? BLIS1_CONJ_TRANSPOSE \
	                                    : BLIS1_NO_TRANSPOSE ); \
}

// --- toggle_conj ---

// Flips the lvalue conj: BLIS1_NO_CONJUGATE when bl1_is_conj( conj ) holds,
// BLIS1_CONJUGATE otherwise.
#define bl1_toggle_conj( conj ) \
{ \
	(conj) = ( bl1_is_conj( conj ) ? BLIS1_NO_CONJUGATE : BLIS1_CONJUGATE ); \
}

#endif // #ifndef BLIS1_MACRO_DEFS_H
// end blis_macro_defs.h
// begin blis_type_defs.h


#ifndef BLIS1_TYPE_DEFS_H
#define BLIS1_TYPE_DEFS_H

// --- Basic type definitions -------------------------------------------------



// First enumerator value of each parameter family. Each family starts at a
// different multiple of 100, so the ranges of different families do not
// overlap.
#define BLIS1_TRANS_BEGIN 100
#define BLIS1_UPLO_BEGIN  200
#define BLIS1_SIDE_BEGIN  300
#define BLIS1_DIAG_BEGIN  400
#define BLIS1_CONJ_BEGIN  500

// Transposition / conjugation selector.
typedef enum
{
	BLIS1_NO_TRANSPOSE      = BLIS1_TRANS_BEGIN,
	BLIS1_TRANSPOSE         = BLIS1_TRANS_BEGIN + 1,
	BLIS1_CONJ_NO_TRANSPOSE = BLIS1_TRANS_BEGIN + 2,
	BLIS1_CONJ_TRANSPOSE    = BLIS1_TRANS_BEGIN + 3
} trans1_t;

// Lower / upper triangle selector.
typedef enum
{
	BLIS1_LOWER_TRIANGULAR = BLIS1_UPLO_BEGIN,
	BLIS1_UPPER_TRIANGULAR = BLIS1_UPLO_BEGIN + 1
} uplo1_t;

// Left / right side selector.
typedef enum
{
	BLIS1_LEFT  = BLIS1_SIDE_BEGIN,
	BLIS1_RIGHT = BLIS1_SIDE_BEGIN + 1
} side1_t;

// Diagonal kind selector.
typedef enum
{
	BLIS1_NONUNIT_DIAG = BLIS1_DIAG_BEGIN,
	BLIS1_UNIT_DIAG    = BLIS1_DIAG_BEGIN + 1,
	BLIS1_ZERO_DIAG    = BLIS1_DIAG_BEGIN + 2
} diag1_t;

// Conjugation selector.
typedef enum
{
	BLIS1_NO_CONJUGATE = BLIS1_CONJ_BEGIN,
	BLIS1_CONJUGATE    = BLIS1_CONJ_BEGIN + 1
} conj1_t;





// --- Intrinsic/assembly definitions ----------------------------------------





// Only define vector intrinsics types if they are not already provided by
// libflame.
#ifndef BLIS1_FROM_LIBFLAME

// NOTE(review): identifiers that are not defined as macros evaluate to 0
// inside #if, so this comparison is also true when neither
// BLIS1_VECTOR_INTRINSIC_TYPE nor BLIS1_SSE_INTRINSICS is defined -- confirm
// both are always defined before this point is reached.
#if BLIS1_VECTOR_INTRINSIC_TYPE == BLIS1_SSE_INTRINSICS

#include "pmmintrin.h" // skipped
// Union overlaying one __m128d value (v) with an array of two doubles (d),
// so the same storage can be accessed as a whole or element by element.
typedef union
{
    __m128d v; 
    double  d[2];
} v2df_t;
#endif

#endif


// --- Complex type definitions -----------------------------------------------

// Only define complex types if they are not already provided by libflame.
//#ifndef BLIS1_ENABLE_USE_OF_LIBFLAME_TYPES
#ifndef BLIS1_FROM_LIBFLAME

// Single-precision complex number: real part followed by imaginary part.
typedef struct scomplex
{
  float real;
  float imag;
} scomplex;

// Double-precision complex number: real part followed by imaginary part.
typedef struct dcomplex
{
  double real;
  double imag;
} dcomplex;

#endif


#endif // BLIS1_TYPE_DEFS_H
// end blis_type_defs.h

// begin blis_prototypes_util.h


// --- Utility-level BLAS-like prototypes --------------------------------------

// --- constant-generating functions ---

float    bl1_s2( void );
double   bl1_d2( void );
scomplex bl1_c2( void );
dcomplex bl1_z2( void );
float    bl1_s1( void );
double   bl1_d1( void );
scomplex bl1_c1( void );
dcomplex bl1_z1( void );
float    bl1_s1h( void );
double   bl1_d1h( void );
scomplex bl1_c1h( void );
dcomplex bl1_z1h( void );
float    bl1_s0( void );
double   bl1_d0( void );
scomplex bl1_c0( void );
dcomplex bl1_z0( void );
float    bl1_sm1h( void );
double   bl1_dm1h( void );
scomplex bl1_cm1h( void );
dcomplex bl1_zm1h( void );
float    bl1_sm1( void );
double   bl1_dm1( void );
scomplex bl1_cm1( void );
dcomplex bl1_zm1( void );
float    bl1_sm2( void );
double   bl1_dm2( void );
scomplex bl1_cm2( void );
dcomplex bl1_zm2( void );

// --- allocv ---

void*     bl1_vallocv( unsigned int n_elem, unsigned int elem_size );
int*      bl1_iallocv( unsigned int n_elem );
float*    bl1_sallocv( unsigned int n_elem );
double*   bl1_dallocv( unsigned int n_elem );
scomplex* bl1_callocv( unsigned int n_elem );
dcomplex* bl1_zallocv( unsigned int n_elem );

// --- allocm ---

void*     bl1_vallocm( unsigned int m, unsigned int n, unsigned int elem_size );
int*      bl1_iallocm( unsigned int m, unsigned int n );
float*    bl1_sallocm( unsigned int m, unsigned int n );
double*   bl1_dallocm( unsigned int m, unsigned int n );
scomplex* bl1_callocm( unsigned int m, unsigned int n );
dcomplex* bl1_zallocm( unsigned int m, unsigned int n );

// --- apdiagmv ---

void bl1_sapdiagmv( side1_t side, conj1_t conj, int m, int n, float*    x, int incx, float*    a, int a_rs, int a_cs );
void bl1_dapdiagmv( side1_t side, conj1_t conj, int m, int n, double*   x, int incx, double*   a, int a_rs, int a_cs );
void bl1_csapdiagmv( side1_t side, conj1_t conj, int m, int n, float*    x, int incx, scomplex* a, int a_rs, int a_cs );
void bl1_capdiagmv( side1_t side, conj1_t conj, int m, int n, scomplex* x, int incx, scomplex* a, int a_rs, int a_cs );
void bl1_zdapdiagmv( side1_t side, conj1_t conj, int m, int n, double*   x, int incx, dcomplex* a, int a_rs, int a_cs );
void bl1_zapdiagmv( side1_t side, conj1_t conj, int m, int n, dcomplex* x, int incx, dcomplex* a, int a_rs, int a_cs );

// --- create_contigm ---

void bl1_screate_contigm( int m, int n, float*    a_save, int a_rs_save, int a_cs_save, float**    a, int* a_rs, int* a_cs );
void bl1_dcreate_contigm( int m, int n, double*   a_save, int a_rs_save, int a_cs_save, double**   a, int* a_rs, int* a_cs );
void bl1_ccreate_contigm( int m, int n, scomplex* a_save, int a_rs_save, int a_cs_save, scomplex** a, int* a_rs, int* a_cs );
void bl1_zcreate_contigm( int m, int n, dcomplex* a_save, int a_rs_save, int a_cs_save, dcomplex** a, int* a_rs, int* a_cs );

// --- create_contigmt ---

void bl1_screate_contigmt( trans1_t trans_dims, int m, int n, float*    a_save, int a_rs_save, int a_cs_save, float**    a, int* a_rs, int* a_cs );
void bl1_dcreate_contigmt( trans1_t trans_dims, int m, int n, double*   a_save, int a_rs_save, int a_cs_save, double**   a, int* a_rs, int* a_cs );
void bl1_ccreate_contigmt( trans1_t trans_dims, int m, int n, scomplex* a_save, int a_rs_save, int a_cs_save, scomplex** a, int* a_rs, int* a_cs );
void bl1_zcreate_contigmt( trans1_t trans_dims, int m, int n, dcomplex* a_save, int a_rs_save, int a_cs_save, dcomplex** a, int* a_rs, int* a_cs );

// --- create_contigmr ---

void bl1_screate_contigmr( uplo1_t uplo, int m, int n, float*    a_save, int a_rs_save, int a_cs_save, float**    a, int* a_rs, int* a_cs );
void bl1_dcreate_contigmr( uplo1_t uplo, int m, int n, double*   a_save, int a_rs_save, int a_cs_save, double**   a, int* a_rs, int* a_cs );
void bl1_ccreate_contigmr( uplo1_t uplo, int m, int n, scomplex* a_save, int a_rs_save, int a_cs_save, scomplex** a, int* a_rs, int* a_cs );
void bl1_zcreate_contigmr( uplo1_t uplo, int m, int n, dcomplex* a_save, int a_rs_save, int a_cs_save, dcomplex** a, int* a_rs, int* a_cs );

// --- create_contigmsr ---

void bl1_screate_contigmsr( side1_t side, uplo1_t uplo, int m, int n, float*    a_save, int a_rs_save, int a_cs_save, float**    a, int* a_rs, int* a_cs );
void bl1_dcreate_contigmsr( side1_t side, uplo1_t uplo, int m, int n, double*   a_save, int a_rs_save, int a_cs_save, double**   a, int* a_rs, int* a_cs );
void bl1_ccreate_contigmsr( side1_t side, uplo1_t uplo, int m, int n, scomplex* a_save, int a_rs_save, int a_cs_save, scomplex** a, int* a_rs, int* a_cs );
void bl1_zcreate_contigmsr( side1_t side, uplo1_t uplo, int m, int n, dcomplex* a_save, int a_rs_save, int a_cs_save, dcomplex** a, int* a_rs, int* a_cs );

// --- free_contigm ---

void bl1_sfree_contigm( float*    a_save, int a_rs_save, int a_cs_save, float**    a, int* a_rs, int* a_cs );
void bl1_dfree_contigm( double*   a_save, int a_rs_save, int a_cs_save, double**   a, int* a_rs, int* a_cs );
void bl1_cfree_contigm( scomplex* a_save, int a_rs_save, int a_cs_save, scomplex** a, int* a_rs, int* a_cs );
void bl1_zfree_contigm( dcomplex* a_save, int a_rs_save, int a_cs_save, dcomplex** a, int* a_rs, int* a_cs );

// --- free_saved_contigm ---

void bl1_sfree_saved_contigm( int m, int n, float*    a_save, int a_rs_save, int a_cs_save, float**    a, int* a_rs, int* a_cs );
void bl1_dfree_saved_contigm( int m, int n, double*   a_save, int a_rs_save, int a_cs_save, double**   a, int* a_rs, int* a_cs );
void bl1_cfree_saved_contigm( int m, int n, scomplex* a_save, int a_rs_save, int a_cs_save, scomplex** a, int* a_rs, int* a_cs );
void bl1_zfree_saved_contigm( int m, int n, dcomplex* a_save, int a_rs_save, int a_cs_save, dcomplex** a, int* a_rs, int* a_cs );

// --- free_saved_contigmr ---

void bl1_sfree_saved_contigmr( uplo1_t uplo, int m, int n, float*    a_save, int a_rs_save, int a_cs_save, float**    a, int* a_rs, int* a_cs );
void bl1_dfree_saved_contigmr( uplo1_t uplo, int m, int n, double*   a_save, int a_rs_save, int a_cs_save, double**   a, int* a_rs, int* a_cs );
void bl1_cfree_saved_contigmr( uplo1_t uplo, int m, int n, scomplex* a_save, int a_rs_save, int a_cs_save, scomplex** a, int* a_rs, int* a_cs );
void bl1_zfree_saved_contigmr( uplo1_t uplo, int m, int n, dcomplex* a_save, int a_rs_save, int a_cs_save, dcomplex** a, int* a_rs, int* a_cs );

// --- free_saved_contigmsr ---

void bl1_sfree_saved_contigmsr( side1_t side, uplo1_t uplo, int m, int n, float*    a_save, int a_rs_save, int a_cs_save, float**    a, int* a_rs, int* a_cs );
void bl1_dfree_saved_contigmsr( side1_t side, uplo1_t uplo, int m, int n, double*   a_save, int a_rs_save, int a_cs_save, double**   a, int* a_rs, int* a_cs );
void bl1_cfree_saved_contigmsr( side1_t side, uplo1_t uplo, int m, int n, scomplex* a_save, int a_rs_save, int a_cs_save, scomplex** a, int* a_rs, int* a_cs );
void bl1_zfree_saved_contigmsr( side1_t side, uplo1_t uplo, int m, int n, dcomplex* a_save, int a_rs_save, int a_cs_save, dcomplex** a, int* a_rs, int* a_cs );

// --- ewinvscalv ---

void bl1_sewinvscalv( conj1_t conj, int n, float*    x, int incx, float*    y, int incy );
void bl1_dewinvscalv( conj1_t conj, int n, double*   x, int incx, double*   y, int incy );
void bl1_csewinvscalv( conj1_t conj, int n, float*    x, int incx, scomplex* y, int incy );
void bl1_cewinvscalv( conj1_t conj, int n, scomplex* x, int incx, scomplex* y, int incy );
void bl1_zdewinvscalv( conj1_t conj, int n, double*   x, int incx, dcomplex* y, int incy );
void bl1_zewinvscalv( conj1_t conj, int n, dcomplex* x, int incx, dcomplex* y, int incy );

// --- ewinvscalmt ---

void bl1_sewinvscalmt( trans1_t trans, int m, int n, float*    a, int a_rs, int a_cs, float*    b, int b_rs, int b_cs );
void bl1_dewinvscalmt( trans1_t trans, int m, int n, double*   a, int a_rs, int a_cs, double*   b, int b_rs, int b_cs );
void bl1_csewinvscalmt( trans1_t trans, int m, int n, float*    a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs );
void bl1_cewinvscalmt( trans1_t trans, int m, int n, scomplex* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs );
void bl1_zdewinvscalmt( trans1_t trans, int m, int n, double*   a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs );
void bl1_zewinvscalmt( trans1_t trans, int m, int n, dcomplex* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs );

// --- ewscalv ---

void bl1_sewscalv( conj1_t conj, int n, float*    x, int incx, float*    y, int incy );
void bl1_dewscalv( conj1_t conj, int n, double*   x, int incx, double*   y, int incy );
void bl1_csewscalv( conj1_t conj, int n, float*    x, int incx, scomplex* y, int incy );
void bl1_cewscalv( conj1_t conj, int n, scomplex* x, int incx, scomplex* y, int incy );
void bl1_zdewscalv( conj1_t conj, int n, double*   x, int incx, dcomplex* y, int incy );
void bl1_zewscalv( conj1_t conj, int n, dcomplex* x, int incx, dcomplex* y, int incy );

// --- ewscalmt ---

void bl1_sewscalmt( trans1_t trans, int m, int n, float*    a, int a_rs, int a_cs, float*    b, int b_rs, int b_cs );
void bl1_dewscalmt( trans1_t trans, int m, int n, double*   a, int a_rs, int a_cs, double*   b, int b_rs, int b_cs );
void bl1_csewscalmt( trans1_t trans, int m, int n, float*    a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs );
void bl1_cewscalmt( trans1_t trans, int m, int n, scomplex* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs );
void bl1_zdewscalmt( trans1_t trans, int m, int n, double*   a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs );
void bl1_zewscalmt( trans1_t trans, int m, int n, dcomplex* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs );

// --- free ---

void bl1_vfree( void*     p );
void bl1_ifree( int*      p );
void bl1_sfree( float*    p );
void bl1_dfree( double*   p );
void bl1_cfree( scomplex* p );
void bl1_zfree( dcomplex* p );

// --- inverts ---

void bl1_sinverts( conj1_t conj, float*    alpha );
void bl1_dinverts( conj1_t conj, double*   alpha );
void bl1_cinverts( conj1_t conj, scomplex* alpha );
void bl1_zinverts( conj1_t conj, dcomplex* alpha );

// --- invert2s ---

void bl1_sinvert2s( conj1_t conj, float*    alpha, float*    beta );
void bl1_dinvert2s( conj1_t conj, double*   alpha, double*   beta );
void bl1_cinvert2s( conj1_t conj, scomplex* alpha, scomplex* beta );
void bl1_zinvert2s( conj1_t conj, dcomplex* alpha, dcomplex* beta );

// --- invertv ---

void bl1_sinvertv( conj1_t conj, int n, float*    x, int incx );
void bl1_dinvertv( conj1_t conj, int n, double*   x, int incx );
void bl1_cinvertv( conj1_t conj, int n, scomplex* x, int incx );
void bl1_zinvertv( conj1_t conj, int n, dcomplex* x, int incx );

// --- ident ---

void bl1_sident( int m, float*    a, int a_rs, int a_cs );
void bl1_dident( int m, double*   a, int a_rs, int a_cs );
void bl1_cident( int m, scomplex* a, int a_rs, int a_cs );
void bl1_zident( int m, dcomplex* a, int a_rs, int a_cs );

// --- maxabsv ---

void bl1_smaxabsv( int n, float*    x, int incx, float*  maxabs );
void bl1_dmaxabsv( int n, double*   x, int incx, double* maxabs );
void bl1_cmaxabsv( int n, scomplex* x, int incx, float*  maxabs );
void bl1_zmaxabsv( int n, dcomplex* x, int incx, double* maxabs );

// --- maxabsm ---

void bl1_smaxabsm( int m, int n, float*    a, int a_rs, int a_cs, float*  maxabs );
void bl1_dmaxabsm( int m, int n, double*   a, int a_rs, int a_cs, double* maxabs );
void bl1_cmaxabsm( int m, int n, scomplex* a, int a_rs, int a_cs, float*  maxabs );
void bl1_zmaxabsm( int m, int n, dcomplex* a, int a_rs, int a_cs, double* maxabs );

// --- maxabsmr ---

void bl1_smaxabsmr( uplo1_t uplo, int m, int n, float*    a, int a_rs, int a_cs, float*  maxabs );
void bl1_dmaxabsmr( uplo1_t uplo, int m, int n, double*   a, int a_rs, int a_cs, double* maxabs );
void bl1_cmaxabsmr( uplo1_t uplo, int m, int n, scomplex* a, int a_rs, int a_cs, float*  maxabs );
void bl1_zmaxabsmr( uplo1_t uplo, int m, int n, dcomplex* a, int a_rs, int a_cs, double* maxabs );

// --- rands ---

void bl1_srands( float*    alpha );
void bl1_drands( double*   alpha );
void bl1_crands( scomplex* alpha );
void bl1_zrands( dcomplex* alpha );

// --- randv ---

void bl1_srandv( int n, float*    x, int incx );
void bl1_drandv( int n, double*   x, int incx );
void bl1_crandv( int n, scomplex* x, int incx );
void bl1_zrandv( int n, dcomplex* x, int incx );

// --- randm ---

void bl1_srandm( int m, int n, float*    a, int a_rs, int a_cs );
void bl1_drandm( int m, int n, double*   a, int a_rs, int a_cs );
void bl1_crandm( int m, int n, scomplex* a, int a_rs, int a_cs );
void bl1_zrandm( int m, int n, dcomplex* a, int a_rs, int a_cs );

// --- randmr ---
void bl1_srandmr( uplo1_t uplo, diag1_t diag, int m, int n, float*    a, int a_rs, int a_cs );
void bl1_drandmr( uplo1_t uplo, diag1_t diag, int m, int n, double*   a, int a_rs, int a_cs );
void bl1_crandmr( uplo1_t uplo, diag1_t diag, int m, int n, scomplex* a, int a_rs, int a_cs );
void bl1_zrandmr( uplo1_t uplo, diag1_t diag, int m, int n, dcomplex* a, int a_rs, int a_cs );

// --- set_contig_strides ---

void bl1_set_contig_strides( int m, int n, int* rs, int* cs );

// --- set_dim_with_side ---

void bl1_set_dim_with_side( side1_t side, int m, int n, int* dim_new );

// --- set_dims_with_trans ---

void bl1_set_dims_with_trans( trans1_t trans, int m, int n, int* m_new, int* n_new );

// --- setv ---

void bl1_isetv( int m, int*      sigma, int*      x, int incx );
void bl1_ssetv( int m, float*    sigma, float*    x, int incx );
void bl1_dsetv( int m, double*   sigma, double*   x, int incx );
void bl1_csetv( int m, scomplex* sigma, scomplex* x, int incx );
void bl1_zsetv( int m, dcomplex* sigma, dcomplex* x, int incx );

// --- setm ---

void bl1_isetm( int m, int n, int*      sigma, int*      a, int a_rs, int a_cs );
void bl1_ssetm( int m, int n, float*    sigma, float*    a, int a_rs, int a_cs );
void bl1_dsetm( int m, int n, double*   sigma, double*   a, int a_rs, int a_cs );
void bl1_csetm( int m, int n, scomplex* sigma, scomplex* a, int a_rs, int a_cs );
void bl1_zsetm( int m, int n, dcomplex* sigma, dcomplex* a, int a_rs, int a_cs );

// --- setmr ---

void bl1_ssetmr( uplo1_t uplo, int m, int n, float*    sigma, float*    a, int a_rs, int a_cs );
void bl1_dsetmr( uplo1_t uplo, int m, int n, double*   sigma, double*   a, int a_rs, int a_cs );
void bl1_csetmr( uplo1_t uplo, int m, int n, scomplex* sigma, scomplex* a, int a_rs, int a_cs );
void bl1_zsetmr( uplo1_t uplo, int m, int n, dcomplex* sigma, dcomplex* a, int a_rs, int a_cs );

// --- setdiag ---

void bl1_isetdiag( int offset, int m, int n, int*      sigma, int*      a, int a_rs, int a_cs );
void bl1_ssetdiag( int offset, int m, int n, float*    sigma, float*    a, int a_rs, int a_cs );
void bl1_dsetdiag( int offset, int m, int n, double*   sigma, double*   a, int a_rs, int a_cs );
void bl1_csetdiag( int offset, int m, int n, scomplex* sigma, scomplex* a, int a_rs, int a_cs );
void bl1_zsetdiag( int offset, int m, int n, dcomplex* sigma, dcomplex* a, int a_rs, int a_cs );

// --- scalediag ---

void bl1_sscalediag( conj1_t conj, int offset, int m, int n, float*    sigma, float*    a, int a_rs, int a_cs );
void bl1_dscalediag( conj1_t conj, int offset, int m, int n, double*   sigma, double*   a, int a_rs, int a_cs );
void bl1_cscalediag( conj1_t conj, int offset, int m, int n, scomplex* sigma, scomplex* a, int a_rs, int a_cs );
void bl1_zscalediag( conj1_t conj, int offset, int m, int n, dcomplex* sigma, dcomplex* a, int a_rs, int a_cs );
void bl1_csscalediag( conj1_t conj, int offset, int m, int n, float*    sigma, scomplex* a, int a_rs, int a_cs );
void bl1_zdscalediag( conj1_t conj, int offset, int m, int n, double*   sigma, dcomplex* a, int a_rs, int a_cs );

// --- shiftdiag ---

void bl1_sshiftdiag( conj1_t conj, int offset, int m, int n, float*    sigma, float*    a, int a_rs, int a_cs );
void bl1_dshiftdiag( conj1_t conj, int offset, int m, int n, double*   sigma, double*   a, int a_rs, int a_cs );
void bl1_cshiftdiag( conj1_t conj, int offset, int m, int n, scomplex* sigma, scomplex* a, int a_rs, int a_cs );
void bl1_zshiftdiag( conj1_t conj, int offset, int m, int n, dcomplex* sigma, dcomplex* a, int a_rs, int a_cs );
void bl1_csshiftdiag( conj1_t conj, int offset, int m, int n, float*    sigma, scomplex* a, int a_rs, int a_cs );
void bl1_zdshiftdiag( conj1_t conj, int offset, int m, int n, double*   sigma, dcomplex* a, int a_rs, int a_cs );

// --- symmize ---

void bl1_ssymmize( conj1_t conj, uplo1_t uplo, int m, float*    a, int a_rs, int a_cs );
void bl1_dsymmize( conj1_t conj, uplo1_t uplo, int m, double*   a, int a_rs, int a_cs );
void bl1_csymmize( conj1_t conj, uplo1_t uplo, int m, scomplex* a, int a_rs, int a_cs );
void bl1_zsymmize( conj1_t conj, uplo1_t uplo, int m, dcomplex* a, int a_rs, int a_cs );

// end blis_prototypes_util.h
// begin blis_prototypes_query.h


// --- Query routine prototypes ------------------------------------------------

// --- trans ---

int bl1_does_trans( trans1_t trans );
int bl1_does_notrans( trans1_t trans );
int bl1_does_conj( trans1_t trans );

int bl1_is_notrans( trans1_t trans );
int bl1_is_trans( trans1_t trans );
int bl1_is_conjnotrans( trans1_t trans );
int bl1_is_conjtrans( trans1_t trans );

// --- conj ---

int bl1_is_noconj( conj1_t conj );
int bl1_is_conj( conj1_t conj );

// --- uplo ---

int bl1_is_lower( uplo1_t uplo );
int bl1_is_upper( uplo1_t uplo );

// --- side ---

int bl1_is_left( side1_t side );
int bl1_is_right( side1_t side );

// --- diag ---

int bl1_is_nonunit_diag( diag1_t diag );
int bl1_is_unit_diag( diag1_t diag );
int bl1_is_zero_diag( diag1_t diag );

// --- mapping-related ---

conj1_t bl1_proj_trans1_to_conj( trans1_t trans );

// --- storage-related ---

void bl1_check_storage_3m( int a_rs, int a_cs, int b_rs, int b_cs, int c_rs, int c_cs );
void bl1_check_storage_2m( int a_rs, int a_cs, int b_rs, int b_cs );
int bl1_is_row_or_col_storage( int rs, int cs );
int bl1_is_row_storage( int rs, int cs );
int bl1_is_col_storage( int rs, int cs );
int bl1_is_gen_storage( int rs, int cs );
int bl1_is_vector( int m, int n );

// --- vector-related ---

int bl1_vector_dim( int m, int n );
int bl1_vector_inc( trans1_t trans, int m, int n, int rs, int cs );

// --- dimension-related ---

int bl1_zero_dim1( int m );
int bl1_zero_dim2( int m, int n );
int bl1_zero_dim3( int m, int k, int n );

// end blis_prototypes_query.h
// begin blis_prototypes_misc.h


// --- Abort prototypes --------------------------------------------------------

void bl1_abort( void );
void bl1_abort_msg( char* message );

// --- Parameter-mapping prototypes --------------------------------------------

void bl1_param_map_to_netlib_trans( trans1_t blis_trans, void* blas_trans );
void bl1_param_map_to_netlib_uplo(  uplo1_t  blis_uplo,  void* blas_uplo );
void bl1_param_map_to_netlib_side(  side1_t  blis_side,  void* blas_side );
void bl1_param_map_to_netlib_diag(  diag1_t  blis_diag,  void* blas_diag );

// end blis_prototypes_misc.h

// begin blis_prototypes_level1.h


// --- Level-1 BLAS-like prototypes --------------------------------------------

// --- amax ---

void bl1_samax( int n, float*    x, int incx, int* index );
void bl1_damax( int n, double*   x, int incx, int* index );
void bl1_camax( int n, scomplex* x, int incx, int* index );
void bl1_zamax( int n, dcomplex* x, int incx, int* index );

// --- asum ---

void bl1_sasum( int n, float*    x, int incx, float*  norm );
void bl1_dasum( int n, double*   x, int incx, double* norm );
void bl1_casum( int n, scomplex* x, int incx, float*  norm );
void bl1_zasum( int n, dcomplex* x, int incx, double* norm );

// --- axpy ---

void bl1_saxpy( int n, float*    alpha, float*    x, int incx, float*    y, int incy );
void bl1_daxpy( int n, double*   alpha, double*   x, int incx, double*   y, int incy );
void bl1_caxpy( int n, scomplex* alpha, scomplex* x, int incx, scomplex* y, int incy );
void bl1_zaxpy( int n, dcomplex* alpha, dcomplex* x, int incx, dcomplex* y, int incy );

// --- axpyv ---

void bl1_saxpyv( conj1_t conj, int n, float*    alpha, float*    x, int incx, float*    y, int incy );
void bl1_daxpyv( conj1_t conj, int n, double*   alpha, double*   x, int incx, double*   y, int incy );
void bl1_caxpyv( conj1_t conj, int n, scomplex* alpha, scomplex* x, int incx, scomplex* y, int incy );
void bl1_zaxpyv( conj1_t conj, int n, dcomplex* alpha, dcomplex* x, int incx, dcomplex* y, int incy );

// --- axpymt ---

void bl1_saxpymt( trans1_t trans, int m, int n, float*    alpha, float*    a, int a_rs, int a_cs, float*    b, int b_rs, int b_cs );
void bl1_daxpymt( trans1_t trans, int m, int n, double*   alpha, double*   a, int a_rs, int a_cs, double*   b, int b_rs, int b_cs );
void bl1_caxpymt( trans1_t trans, int m, int n, scomplex* alpha, scomplex* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs );
void bl1_zaxpymt( trans1_t trans, int m, int n, dcomplex* alpha, dcomplex* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs );

// --- axpymrt ---

void bl1_saxpymrt( uplo1_t uplo, trans1_t trans, int m, int n, float*    alpha, float*    a, int a_rs, int a_cs, float*    b, int b_rs, int b_cs );
void bl1_daxpymrt( uplo1_t uplo, trans1_t trans, int m, int n, double*   alpha, double*   a, int a_rs, int a_cs, double*   b, int b_rs, int b_cs );
void bl1_caxpymrt( uplo1_t uplo, trans1_t trans, int m, int n, scomplex* alpha, scomplex* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs );
void bl1_zaxpymrt( uplo1_t uplo, trans1_t trans, int m, int n, dcomplex* alpha, dcomplex* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs );

// --- axpysv ---

void bl1_saxpysv( int n, float*    alpha0, float*    alpha1, float*    x, int incx, float*    beta, float*    y, int incy );
void bl1_daxpysv( int n, double*   alpha0, double*   alpha1, double*   x, int incx, double*   beta, double*   y, int incy );
void bl1_caxpysv( int n, scomplex* alpha0, scomplex* alpha1, scomplex* x, int incx, scomplex* beta, scomplex* y, int incy );
void bl1_zaxpysv( int n, dcomplex* alpha0, dcomplex* alpha1, dcomplex* x, int incx, dcomplex* beta, dcomplex* y, int incy );

// --- axpysmt ---

void bl1_saxpysmt( trans1_t trans, int m, int n, float*    alpha0, float*    alpha1, float*    a, int a_rs, int a_cs, float*    beta, float*    b, int b_rs, int b_cs );
void bl1_daxpysmt( trans1_t trans, int m, int n, double*   alpha0, double*   alpha1, double*   a, int a_rs, int a_cs, double*   beta, double*   b, int b_rs, int b_cs );
void bl1_caxpysmt( trans1_t trans, int m, int n, scomplex* alpha0, scomplex* alpha1, scomplex* a, int a_rs, int a_cs, scomplex* beta, scomplex* b, int b_rs, int b_cs );
void bl1_zaxpysmt( trans1_t trans, int m, int n, dcomplex* alpha0, dcomplex* alpha1, dcomplex* a, int a_rs, int a_cs, dcomplex* beta, dcomplex* b, int b_rs, int b_cs );

// --- conjv ---

void bl1_sconjv( int m, float* x, int incx );
void bl1_dconjv( int m, double* x, int incx );
void bl1_cconjv( int m, scomplex* x, int incx );
void bl1_zconjv( int m, dcomplex* x, int incx );

// --- conjm ---

void bl1_sconjm( int m, int n, float*    a, int a_rs, int a_cs );
void bl1_dconjm( int m, int n, double*   a, int a_rs, int a_cs );
void bl1_cconjm( int m, int n, scomplex* a, int a_rs, int a_cs );
void bl1_zconjm( int m, int n, dcomplex* a, int a_rs, int a_cs );

// --- conjmr ---

void bl1_sconjmr( uplo1_t uplo, int m, int n, float*    a, int a_rs, int a_cs );
void bl1_dconjmr( uplo1_t uplo, int m, int n, double*   a, int a_rs, int a_cs );
void bl1_cconjmr( uplo1_t uplo, int m, int n, scomplex* a, int a_rs, int a_cs );
void bl1_zconjmr( uplo1_t uplo, int m, int n, dcomplex* a, int a_rs, int a_cs );

// --- copy ---

void bl1_scopy( int m, float*    x, int incx, float*    y, int incy );
void bl1_dcopy( int m, double*   x, int incx, double*   y, int incy );
void bl1_ccopy( int m, scomplex* x, int incx, scomplex* y, int incy );
void bl1_zcopy( int m, dcomplex* x, int incx, dcomplex* y, int incy );

// --- copyv ---
// Vector copy prototypes that additionally take a conj1_t argument.
// The first group uses one element type for both x and y (i: int, s: float,
// d: double, c: scomplex, z: dcomplex).
// The second group is mixed-type: the two-letter prefix names the element
// type of x first and of y second (e.g. sd: float x, double y).

void bl1_icopyv( conj1_t conj, int m, int*      x, int incx, int*      y, int incy );
void bl1_scopyv( conj1_t conj, int m, float*    x, int incx, float*    y, int incy );
void bl1_dcopyv( conj1_t conj, int m, double*   x, int incx, double*   y, int incy );
void bl1_ccopyv( conj1_t conj, int m, scomplex* x, int incx, scomplex* y, int incy );
void bl1_zcopyv( conj1_t conj, int m, dcomplex* x, int incx, dcomplex* y, int incy );

void bl1_sdcopyv( conj1_t conj, int m, float*    x, int incx, double*   y, int incy );
void bl1_dscopyv( conj1_t conj, int m, double*   x, int incx, float*    y, int incy );
void bl1_sccopyv( conj1_t conj, int m, float*    x, int incx, scomplex* y, int incy );
void bl1_cscopyv( conj1_t conj, int m, scomplex* x, int incx, float*    y, int incy );
void bl1_szcopyv( conj1_t conj, int m, float*    x, int incx, dcomplex* y, int incy );
void bl1_zscopyv( conj1_t conj, int m, dcomplex* x, int incx, float*    y, int incy );
void bl1_dccopyv( conj1_t conj, int m, double*   x, int incx, scomplex* y, int incy );
void bl1_cdcopyv( conj1_t conj, int m, scomplex* x, int incx, double*   y, int incy );
void bl1_dzcopyv( conj1_t conj, int m, double*   x, int incx, dcomplex* y, int incy );
void bl1_zdcopyv( conj1_t conj, int m, dcomplex* x, int incx, double*   y, int incy );
void bl1_czcopyv( conj1_t conj, int m, scomplex* x, int incx, dcomplex* y, int incy );
void bl1_zccopyv( conj1_t conj, int m, dcomplex* x, int incx, scomplex* y, int incy );

// --- copymr ---
// Matrix copy prototypes taking a uplo1_t selector, sizes m and n, and two
// matrices a and b, each with its own (rs, cs) pair.
// Single-letter variants use one element type for both a and b.
// Two-letter variants name the element type of a first and of b second
// (s: float, d: double, c: scomplex, z: dcomplex); this includes the
// same-type spellings ss, dd, cc and zz.
// NOTE(review): *_rs/*_cs are presumably row/column strides -- confirm.

void bl1_scopymr( uplo1_t uplo, int m, int n, float*    a, int a_rs, int a_cs, float*    b, int b_rs, int b_cs );
void bl1_dcopymr( uplo1_t uplo, int m, int n, double*   a, int a_rs, int a_cs, double*   b, int b_rs, int b_cs );
void bl1_ccopymr( uplo1_t uplo, int m, int n, scomplex* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs );
void bl1_zcopymr( uplo1_t uplo, int m, int n, dcomplex* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs );

void bl1_sscopymr( uplo1_t uplo, int m, int n, float*    a, int a_rs, int a_cs, float*    b, int b_rs, int b_cs );
void bl1_sdcopymr( uplo1_t uplo, int m, int n, float*    a, int a_rs, int a_cs, double*   b, int b_rs, int b_cs );
void bl1_dscopymr( uplo1_t uplo, int m, int n, double*   a, int a_rs, int a_cs, float*    b, int b_rs, int b_cs );
void bl1_sccopymr( uplo1_t uplo, int m, int n, float*    a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs );
void bl1_cscopymr( uplo1_t uplo, int m, int n, scomplex* a, int a_rs, int a_cs, float*    b, int b_rs, int b_cs );
void bl1_szcopymr( uplo1_t uplo, int m, int n, float*    a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs );
void bl1_zscopymr( uplo1_t uplo, int m, int n, dcomplex* a, int a_rs, int a_cs, float*    b, int b_rs, int b_cs );
void bl1_ddcopymr( uplo1_t uplo, int m, int n, double*   a, int a_rs, int a_cs, double*   b, int b_rs, int b_cs );
void bl1_dccopymr( uplo1_t uplo, int m, int n, double*   a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs );
void bl1_cdcopymr( uplo1_t uplo, int m, int n, scomplex* a, int a_rs, int a_cs, double*   b, int b_rs, int b_cs );
void bl1_dzcopymr( uplo1_t uplo, int m, int n, double*   a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs );
void bl1_zdcopymr( uplo1_t uplo, int m, int n, dcomplex* a, int a_rs, int a_cs, double*   b, int b_rs, int b_cs );
void bl1_cccopymr( uplo1_t uplo, int m, int n, scomplex* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs );
void bl1_czcopymr( uplo1_t uplo, int m, int n, scomplex* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs );
void bl1_zccopymr( uplo1_t uplo, int m, int n, dcomplex* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs );
void bl1_zzcopymr( uplo1_t uplo, int m, int n, dcomplex* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs );

// --- copymrt ---
// Matrix copy prototypes taking both a uplo1_t selector and a trans1_t
// argument, followed by sizes m and n and two matrices a and b with their own
// (rs, cs) pairs.
// Single-letter variants use one element type for both a and b.
// Two-letter variants name the element type of a first and of b second
// (s: float, d: double, c: scomplex, z: dcomplex).

void bl1_scopymrt( uplo1_t uplo, trans1_t trans, int m, int n, float*    a, int a_rs, int a_cs, float*    b, int b_rs, int b_cs );
void bl1_dcopymrt( uplo1_t uplo, trans1_t trans, int m, int n, double*   a, int a_rs, int a_cs, double*   b, int b_rs, int b_cs );
void bl1_ccopymrt( uplo1_t uplo, trans1_t trans, int m, int n, scomplex* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs );
void bl1_zcopymrt( uplo1_t uplo, trans1_t trans, int m, int n, dcomplex* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs );

void bl1_sscopymrt( uplo1_t uplo, trans1_t trans, int m, int n, float*    a, int a_rs, int a_cs, float*    b, int b_rs, int b_cs );
void bl1_sdcopymrt( uplo1_t uplo, trans1_t trans, int m, int n, float*    a, int a_rs, int a_cs, double*   b, int b_rs, int b_cs );
void bl1_sccopymrt( uplo1_t uplo, trans1_t trans, int m, int n, float*    a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs );
void bl1_szcopymrt( uplo1_t uplo, trans1_t trans, int m, int n, float*    a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs );
void bl1_dscopymrt( uplo1_t uplo, trans1_t trans, int m, int n, double*   a, int a_rs, int a_cs, float*    b, int b_rs, int b_cs );
void bl1_ddcopymrt( uplo1_t uplo, trans1_t trans, int m, int n, double*   a, int a_rs, int a_cs, double*   b, int b_rs, int b_cs );
void bl1_dccopymrt( uplo1_t uplo, trans1_t trans, int m, int n, double*   a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs );
void bl1_dzcopymrt( uplo1_t uplo, trans1_t trans, int m, int n, double*   a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs );
void bl1_cscopymrt( uplo1_t uplo, trans1_t trans, int m, int n, scomplex* a, int a_rs, int a_cs, float*    b, int b_rs, int b_cs );
void bl1_cdcopymrt( uplo1_t uplo, trans1_t trans, int m, int n, scomplex* a, int a_rs, int a_cs, double*   b, int b_rs, int b_cs );
void bl1_cccopymrt( uplo1_t uplo, trans1_t trans, int m, int n, scomplex* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs );
void bl1_czcopymrt( uplo1_t uplo, trans1_t trans, int m, int n, scomplex* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs );
void bl1_zscopymrt( uplo1_t uplo, trans1_t trans, int m, int n, dcomplex* a, int a_rs, int a_cs, float*    b, int b_rs, int b_cs );
void bl1_zdcopymrt( uplo1_t uplo, trans1_t trans, int m, int n, dcomplex* a, int a_rs, int a_cs, double*   b, int b_rs, int b_cs );
void bl1_zccopymrt( uplo1_t uplo, trans1_t trans, int m, int n, dcomplex* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs );
void bl1_zzcopymrt( uplo1_t uplo, trans1_t trans, int m, int n, dcomplex* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs );

// --- copymt ---
// Matrix copy prototypes taking a trans1_t argument (no uplo selector),
// sizes m and n, and two matrices a and b with their own (rs, cs) pairs.
// Single-letter variants use one element type for both a and b (i: int,
// s: float, d: double, c: scomplex, z: dcomplex).
// Two-letter variants name the element type of a first and of b second.

void bl1_icopymt( trans1_t trans, int m, int n, int*      a, int a_rs, int a_cs, int*      b, int b_rs, int b_cs );
void bl1_scopymt( trans1_t trans, int m, int n, float*    a, int a_rs, int a_cs, float*    b, int b_rs, int b_cs );
void bl1_dcopymt( trans1_t trans, int m, int n, double*   a, int a_rs, int a_cs, double*   b, int b_rs, int b_cs );
void bl1_ccopymt( trans1_t trans, int m, int n, scomplex* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs );
void bl1_zcopymt( trans1_t trans, int m, int n, dcomplex* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs );

void bl1_sscopymt( trans1_t trans, int m, int n, float*    a, int a_rs, int a_cs, float*    b, int b_rs, int b_cs );
void bl1_sdcopymt( trans1_t trans, int m, int n, float*    a, int a_rs, int a_cs, double*   b, int b_rs, int b_cs );
void bl1_dscopymt( trans1_t trans, int m, int n, double*   a, int a_rs, int a_cs, float*    b, int b_rs, int b_cs );
void bl1_sccopymt( trans1_t trans, int m, int n, float*    a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs );
void bl1_cscopymt( trans1_t trans, int m, int n, scomplex* a, int a_rs, int a_cs, float*    b, int b_rs, int b_cs );
void bl1_szcopymt( trans1_t trans, int m, int n, float*    a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs );
void bl1_zscopymt( trans1_t trans, int m, int n, dcomplex* a, int a_rs, int a_cs, float*    b, int b_rs, int b_cs );
void bl1_ddcopymt( trans1_t trans, int m, int n, double*   a, int a_rs, int a_cs, double*   b, int b_rs, int b_cs );
void bl1_dccopymt( trans1_t trans, int m, int n, double*   a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs );
void bl1_cdcopymt( trans1_t trans, int m, int n, scomplex* a, int a_rs, int a_cs, double*   b, int b_rs, int b_cs );
void bl1_dzcopymt( trans1_t trans, int m, int n, double*   a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs );
void bl1_zdcopymt( trans1_t trans, int m, int n, dcomplex* a, int a_rs, int a_cs, double*   b, int b_rs, int b_cs );
void bl1_cccopymt( trans1_t trans, int m, int n, scomplex* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs );
void bl1_czcopymt( trans1_t trans, int m, int n, scomplex* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs );
void bl1_zccopymt( trans1_t trans, int m, int n, dcomplex* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs );
void bl1_zzcopymt( trans1_t trans, int m, int n, dcomplex* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs );

// --- dot ---
// Dot-product prototypes. Each takes a conj1_t argument, a length n, two
// vectors x and y with increments, and a pointer rho of the same element type
// (presumably where the result is stored -- the functions return void).
// The *_in variants are declared for the complex types only.
// NOTE(review): which operand conj applies to, and how the *_in variants
// differ from the plain ones, is not visible here -- confirm in the sources.

void bl1_cdot_in( conj1_t conj, int n, scomplex* x, int incx, scomplex* y, int incy, scomplex* rho );
void bl1_zdot_in( conj1_t conj, int n, dcomplex* x, int incx, dcomplex* y, int incy, dcomplex* rho );

void bl1_sdot( conj1_t conj, int n, float*    x, int incx, float*    y, int incy, float*    rho );
void bl1_ddot( conj1_t conj, int n, double*   x, int incx, double*   y, int incy, double*   rho );
void bl1_cdot( conj1_t conj, int n, scomplex* x, int incx, scomplex* y, int incy, scomplex* rho );
void bl1_zdot( conj1_t conj, int n, dcomplex* x, int incx, dcomplex* y, int incy, dcomplex* rho );

// --- dots ---
// Scaled dot-product prototypes: same operands as the dot family plus two
// scalars, alpha and beta, passed by pointer. All operands share the element
// type named by the prefix (s: float, d: double, c: scomplex, z: dcomplex).
// NOTE(review): presumably rho := beta * rho + alpha * dot(x, y) -- confirm
// against the implementation.

void bl1_sdots( conj1_t conj, int n, float*    alpha, float*    x, int incx, float*    y, int incy, float*    beta, float*    rho );
void bl1_ddots( conj1_t conj, int n, double*   alpha, double*   x, int incx, double*   y, int incy, double*   beta, double*   rho );
void bl1_cdots( conj1_t conj, int n, scomplex* alpha, scomplex* x, int incx, scomplex* y, int incy, scomplex* beta, scomplex* rho );
void bl1_zdots( conj1_t conj, int n, dcomplex* alpha, dcomplex* x, int incx, dcomplex* y, int incy, dcomplex* beta, dcomplex* rho );

// --- dot2s ---
// dot2s prototypes: the parameter list is identical to the dots family
// (conj, n, alpha, x, y, beta, rho), with all operands sharing the element
// type named by the prefix (s: float, d: double, c: scomplex, z: dcomplex).
// NOTE(review): the exact update formula is not visible from the prototypes;
// consult the implementation before relying on it.

void bl1_sdot2s( conj1_t conj, int n, float*    alpha, float*    x, int incx, float*    y, int incy, float*    beta, float*    rho );
void bl1_ddot2s( conj1_t conj, int n, double*   alpha, double*   x, int incx, double*   y, int incy, double*   beta, double*   rho );
void bl1_cdot2s( conj1_t conj, int n, scomplex* alpha, scomplex* x, int incx, scomplex* y, int incy, scomplex* beta, scomplex* rho );
void bl1_zdot2s( conj1_t conj, int n, dcomplex* alpha, dcomplex* x, int incx, dcomplex* y, int incy, dcomplex* beta, dcomplex* rho );

// --- fnorm ---
// Matrix norm prototypes (presumably the Frobenius norm, judging by the name).
// The norm argument is always a pointer to a real type: float* for the s and
// c variants, double* for the d and z variants.

void bl1_sfnorm( int m, int n, float*    a, int a_rs, int a_cs, float*  norm );
void bl1_dfnorm( int m, int n, double*   a, int a_rs, int a_cs, double* norm );
void bl1_cfnorm( int m, int n, scomplex* a, int a_rs, int a_cs, float*  norm );
void bl1_zfnorm( int m, int n, dcomplex* a, int a_rs, int a_cs, double* norm );

// --- invscalv ---
// Vector inverse-scaling prototypes (presumably x := x / alpha -- confirm).
// The s, d, c and z variants take alpha of the same element type as x.
// The cs and zd variants take a real alpha (float / double) with a complex x
// (scomplex / dcomplex).

void bl1_sinvscalv(  conj1_t conj, int n, float*    alpha, float*    x, int incx );
void bl1_dinvscalv(  conj1_t conj, int n, double*   alpha, double*   x, int incx );
void bl1_csinvscalv( conj1_t conj, int n, float*    alpha, scomplex* x, int incx );
void bl1_cinvscalv(  conj1_t conj, int n, scomplex* alpha, scomplex* x, int incx );
void bl1_zdinvscalv( conj1_t conj, int n, double*   alpha, dcomplex* x, int incx );
void bl1_zinvscalv(  conj1_t conj, int n, dcomplex* alpha, dcomplex* x, int incx );

// --- invscalm ---
// Matrix inverse-scaling prototypes (presumably a := a / alpha -- confirm).
// The s, d, c and z variants take alpha of the same element type as a.
// The cs and zd variants take a real alpha (float / double) with a complex a
// (scomplex / dcomplex).

void bl1_sinvscalm(  conj1_t conj, int m, int n, float*    alpha, float*    a, int a_rs, int a_cs );
void bl1_dinvscalm(  conj1_t conj, int m, int n, double*   alpha, double*   a, int a_rs, int a_cs );
void bl1_csinvscalm( conj1_t conj, int m, int n, float*    alpha, scomplex* a, int a_rs, int a_cs );
void bl1_cinvscalm(  conj1_t conj, int m, int n, scomplex* alpha, scomplex* a, int a_rs, int a_cs );
void bl1_zdinvscalm( conj1_t conj, int m, int n, double*   alpha, dcomplex* a, int a_rs, int a_cs );
void bl1_zinvscalm(  conj1_t conj, int m, int n, dcomplex* alpha, dcomplex* a, int a_rs, int a_cs );

// --- nrm2 ---
// Vector norm prototypes (presumably the 2-norm, as in BLAS xNRM2 -- confirm).
// The norm argument is always a pointer to a real type: float* for the s and
// c variants, double* for the d and z variants.

void bl1_snrm2( int n, float*    x, int incx, float*  norm );
void bl1_dnrm2( int n, double*   x, int incx, double* norm );
void bl1_cnrm2( int n, scomplex* x, int incx, float*  norm );
void bl1_znrm2( int n, dcomplex* x, int incx, double* norm );

// --- scal ---
// Vector scaling prototypes without a conj argument.
// The s, d, c and z variants take alpha of the same element type as x.
// The cs and zd variants take a real alpha (float / double) with a complex x
// (scomplex / dcomplex).

void bl1_sscal(  int n, float*    alpha, float*    x, int incx );
void bl1_dscal(  int n, double*   alpha, double*   x, int incx );
void bl1_csscal( int n, float*    alpha, scomplex* x, int incx );
void bl1_cscal(  int n, scomplex* alpha, scomplex* x, int incx );
void bl1_zdscal( int n, double*   alpha, dcomplex* x, int incx );
void bl1_zscal(  int n, dcomplex* alpha, dcomplex* x, int incx );

// --- scalv ---
// Vector scaling prototypes that take a leading conj1_t argument; otherwise
// the parameter lists match the scal family.
// The cs and zd variants take a real alpha (float / double) with a complex x.

void bl1_sscalv(  conj1_t conj, int n, float*    alpha, float*    x, int incx );
void bl1_dscalv(  conj1_t conj, int n, double*   alpha, double*   x, int incx );
void bl1_csscalv( conj1_t conj, int n, float*    alpha, scomplex* x, int incx );
void bl1_cscalv(  conj1_t conj, int n, scomplex* alpha, scomplex* x, int incx );
void bl1_zdscalv( conj1_t conj, int n, double*   alpha, dcomplex* x, int incx );
void bl1_zscalv(  conj1_t conj, int n, dcomplex* alpha, dcomplex* x, int incx );

// --- scalm ---
// Matrix scaling prototypes taking a conj1_t argument, sizes m and n, a
// scalar alpha by pointer, and a matrix a with its (rs, cs) pair.
// The cs and zd variants take a real alpha (float / double) with a complex a.

void bl1_sscalm(  conj1_t conj, int m, int n, float*    alpha, float*    a, int a_rs, int a_cs );
void bl1_dscalm(  conj1_t conj, int m, int n, double*   alpha, double*   a, int a_rs, int a_cs );
void bl1_csscalm( conj1_t conj, int m, int n, float*    alpha, scomplex* a, int a_rs, int a_cs );
void bl1_cscalm(  conj1_t conj, int m, int n, scomplex* alpha, scomplex* a, int a_rs, int a_cs );
void bl1_zdscalm( conj1_t conj, int m, int n, double*   alpha, dcomplex* a, int a_rs, int a_cs );
void bl1_zscalm(  conj1_t conj, int m, int n, dcomplex* alpha, dcomplex* a, int a_rs, int a_cs );

// --- scalmr ---
// Matrix scaling prototypes that take a uplo1_t selector instead of a conj
// argument (presumably restricting the operation to one triangle -- confirm).
// The cs and zd variants take a real alpha (float / double) with a complex a.

void bl1_sscalmr(  uplo1_t uplo, int m, int n, float*    alpha, float*    a, int a_rs, int a_cs );
void bl1_dscalmr(  uplo1_t uplo, int m, int n, double*   alpha, double*   a, int a_rs, int a_cs );
void bl1_csscalmr( uplo1_t uplo, int m, int n, float*    alpha, scomplex* a, int a_rs, int a_cs );
void bl1_cscalmr(  uplo1_t uplo, int m, int n, scomplex* alpha, scomplex* a, int a_rs, int a_cs );
void bl1_zdscalmr( uplo1_t uplo, int m, int n, double*   alpha, dcomplex* a, int a_rs, int a_cs );
void bl1_zscalmr(  uplo1_t uplo, int m, int n, dcomplex* alpha, dcomplex* a, int a_rs, int a_cs );

// --- swap ---
// Vector swap prototypes, one per element type (s: float, d: double,
// c: scomplex, z: dcomplex). Both x and y are non-const pointers of the same
// element type, each with its own increment.

void bl1_sswap( int n, float*    x, int incx, float*    y, int incy );
void bl1_dswap( int n, double*   x, int incx, double*   y, int incy );
void bl1_cswap( int n, scomplex* x, int incx, scomplex* y, int incy );
void bl1_zswap( int n, dcomplex* x, int incx, dcomplex* y, int incy );

// --- swapv ---
// Vector swap prototypes with the same parameter lists as the swap family
// (n, x, incx, y, incy), one per element type.
// NOTE(review): how swapv differs from swap is not visible from the
// prototypes; check the implementation.

void bl1_sswapv( int n, float*    x, int incx, float*    y, int incy );
void bl1_dswapv( int n, double*   x, int incx, double*   y, int incy );
void bl1_cswapv( int n, scomplex* x, int incx, scomplex* y, int incy );
void bl1_zswapv( int n, dcomplex* x, int incx, dcomplex* y, int incy );

// --- swapmt ---
// Matrix swap prototypes taking a trans1_t argument, sizes m and n, and two
// matrices a and b of the same element type, each with its own (rs, cs) pair.

void bl1_sswapmt( trans1_t trans, int m, int n, float*    a, int a_rs, int a_cs, float*    b, int b_rs, int b_cs );
void bl1_dswapmt( trans1_t trans, int m, int n, double*   a, int a_rs, int a_cs, double*   b, int b_rs, int b_cs );
void bl1_cswapmt( trans1_t trans, int m, int n, scomplex* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs );
void bl1_zswapmt( trans1_t trans, int m, int n, dcomplex* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs );

// end blis_prototypes_level1.h
// begin blis_prototypes_level2.h


// --- Level-2 BLAS-like prototypes --------------------------------------------

// --- gemv ---
// General matrix-vector prototypes in two flavors:
//  - bl1_?gemv:      takes transa and conjx, and describes a with an
//                    (a_rs, a_cs) pair.
//  - bl1_?gemv_blas: takes transa only (no conjx) and describes a with a
//                    single lda (presumably the BLAS leading dimension, with
//                    these being the wrappers that call BLAS -- confirm).
// alpha and beta are passed by pointer and share the element type of a, x, y.

void bl1_sgemv( trans1_t transa, conj1_t conjx, int m, int n, float*    alpha, float*    a, int a_rs, int a_cs, float*    x, int incx, float*    beta, float*    y, int incy );
void bl1_dgemv( trans1_t transa, conj1_t conjx, int m, int n, double*   alpha, double*   a, int a_rs, int a_cs, double*   x, int incx, double*   beta, double*   y, int incy );
void bl1_cgemv( trans1_t transa, conj1_t conjx, int m, int n, scomplex* alpha, scomplex* a, int a_rs, int a_cs, scomplex* x, int incx, scomplex* beta, scomplex* y, int incy );
void bl1_zgemv( trans1_t transa, conj1_t conjx, int m, int n, dcomplex* alpha, dcomplex* a, int a_rs, int a_cs, dcomplex* x, int incx, dcomplex* beta, dcomplex* y, int incy );

void bl1_sgemv_blas( trans1_t transa, int m, int n, float*    alpha, float*    a, int lda, float*    x, int incx, float*    beta, float*    y, int incy );
void bl1_dgemv_blas( trans1_t transa, int m, int n, double*   alpha, double*   a, int lda, double*   x, int incx, double*   beta, double*   y, int incy );
void bl1_cgemv_blas( trans1_t transa, int m, int n, scomplex* alpha, scomplex* a, int lda, scomplex* x, int incx, scomplex* beta, scomplex* y, int incy );
void bl1_zgemv_blas( trans1_t transa, int m, int n, dcomplex* alpha, dcomplex* a, int lda, dcomplex* x, int incx, dcomplex* beta, dcomplex* y, int incy );

// --- ger ---
// Rank-1 update prototypes in two flavors:
//  - bl1_?ger: takes conjx and conjy, vectors x and y, and a matrix a with an
//              (a_rs, a_cs) pair.
//  - *_blas:   no conj arguments and a single lda. The real types have one
//              routine each (sger, dger); the complex types have two each
//              (gerc and geru), presumably mirroring the conjugated and
//              unconjugated BLAS routines -- confirm.

void bl1_sger( conj1_t conjx, conj1_t conjy, int m, int n, float*    alpha, float*    x, int incx, float*    y, int incy, float*    a, int a_rs, int a_cs );
void bl1_dger( conj1_t conjx, conj1_t conjy, int m, int n, double*   alpha, double*   x, int incx, double*   y, int incy, double*   a, int a_rs, int a_cs );
void bl1_cger( conj1_t conjx, conj1_t conjy, int m, int n, scomplex* alpha, scomplex* x, int incx, scomplex* y, int incy, scomplex* a, int a_rs, int a_cs );
void bl1_zger( conj1_t conjx, conj1_t conjy, int m, int n, dcomplex* alpha, dcomplex* x, int incx, dcomplex* y, int incy, dcomplex* a, int a_rs, int a_cs );

void bl1_sger_blas(  int m, int n, float*    alpha, float*    x, int incx, float*    y, int incy, float*    a, int lda );
void bl1_dger_blas(  int m, int n, double*   alpha, double*   x, int incx, double*   y, int incy, double*   a, int lda );
void bl1_cgerc_blas( int m, int n, scomplex* alpha, scomplex* x, int incx, scomplex* y, int incy, scomplex* a, int lda );
void bl1_cgeru_blas( int m, int n, scomplex* alpha, scomplex* x, int incx, scomplex* y, int incy, scomplex* a, int lda );
void bl1_zgerc_blas( int m, int n, dcomplex* alpha, dcomplex* x, int incx, dcomplex* y, int incy, dcomplex* a, int lda );
void bl1_zgeru_blas( int m, int n, dcomplex* alpha, dcomplex* x, int incx, dcomplex* y, int incy, dcomplex* a, int lda );

// --- hemv ---
// Hermitian matrix-vector prototypes.
//  - bl1_?hemv: declared for all four element types; takes uplo, conj, a
//               single size m, and a matrix a with an (a_rs, a_cs) pair.
//  - *_blas:    declared for the complex types (c, z) only; no conj argument
//               and a single lda.

void bl1_shemv( uplo1_t uplo, conj1_t conj, int m, float*    alpha, float*    a, int a_rs, int a_cs, float*    x, int incx, float*    beta, float*    y, int incy );
void bl1_dhemv( uplo1_t uplo, conj1_t conj, int m, double*   alpha, double*   a, int a_rs, int a_cs, double*   x, int incx, double*   beta, double*   y, int incy );
void bl1_chemv( uplo1_t uplo, conj1_t conj, int m, scomplex* alpha, scomplex* a, int a_rs, int a_cs, scomplex* x, int incx, scomplex* beta, scomplex* y, int incy );
void bl1_zhemv( uplo1_t uplo, conj1_t conj, int m, dcomplex* alpha, dcomplex* a, int a_rs, int a_cs, dcomplex* x, int incx, dcomplex* beta, dcomplex* y, int incy );

void bl1_chemv_blas( uplo1_t uplo, int m, scomplex* alpha, scomplex* a, int lda, scomplex* x, int incx, scomplex* beta, scomplex* y, int incy );
void bl1_zhemv_blas( uplo1_t uplo, int m, dcomplex* alpha, dcomplex* a, int lda, dcomplex* x, int incx, dcomplex* beta, dcomplex* y, int incy );

// --- her ---
// Hermitian rank-1 update prototypes. alpha is always a pointer to a real
// type: float* for the s and c variants, double* for the d and z variants,
// even when x and a are complex.
// The *_blas variants are declared for the complex types only and take a
// single lda instead of an (a_rs, a_cs) pair.

void bl1_sher( uplo1_t uplo, conj1_t conj, int m, float*  alpha, float*    x, int incx, float*    a, int a_rs, int a_cs );
void bl1_dher( uplo1_t uplo, conj1_t conj, int m, double* alpha, double*   x, int incx, double*   a, int a_rs, int a_cs );
void bl1_cher( uplo1_t uplo, conj1_t conj, int m, float*  alpha, scomplex* x, int incx, scomplex* a, int a_rs, int a_cs );
void bl1_zher( uplo1_t uplo, conj1_t conj, int m, double* alpha, dcomplex* x, int incx, dcomplex* a, int a_rs, int a_cs );

void bl1_cher_blas( uplo1_t uplo, int m, float*  alpha, scomplex* x, int incx, scomplex* a, int lda );
void bl1_zher_blas( uplo1_t uplo, int m, double* alpha, dcomplex* x, int incx, dcomplex* a, int lda );

// --- her2 ---
// Hermitian rank-2 update prototypes. Unlike the her family, alpha here has
// the same element type as x, y and a (complex for the c and z variants).
// The *_blas variants are declared for the complex types only and take a
// single lda instead of an (a_rs, a_cs) pair.

void bl1_sher2( uplo1_t uplo, conj1_t conj, int m, float*    alpha, float*    x, int incx, float*    y, int incy, float*    a, int a_rs, int a_cs );
void bl1_dher2( uplo1_t uplo, conj1_t conj, int m, double*   alpha, double*   x, int incx, double*   y, int incy, double*   a, int a_rs, int a_cs );
void bl1_cher2( uplo1_t uplo, conj1_t conj, int m, scomplex* alpha, scomplex* x, int incx, scomplex* y, int incy, scomplex* a, int a_rs, int a_cs );
void bl1_zher2( uplo1_t uplo, conj1_t conj, int m, dcomplex* alpha, dcomplex* x, int incx, dcomplex* y, int incy, dcomplex* a, int a_rs, int a_cs );

void bl1_cher2_blas( uplo1_t uplo, int m, scomplex* alpha, scomplex* x, int incx, scomplex* y, int incy, scomplex* a, int lda );
void bl1_zher2_blas( uplo1_t uplo, int m, dcomplex* alpha, dcomplex* x, int incx, dcomplex* y, int incy, dcomplex* a, int lda );

// --- symv ---
// Symmetric matrix-vector prototypes (no conj argument).
//  - bl1_?symv:      matrix a described by an (a_rs, a_cs) pair.
//  - bl1_?symv_blas: matrix a described by a single lda.
// Both flavors are declared for all four element types (s, d, c, z).

void bl1_ssymv( uplo1_t uplo, int m, float*    alpha, float*    a, int a_rs, int a_cs, float*    x, int incx, float*    beta, float*    y, int incy );
void bl1_dsymv( uplo1_t uplo, int m, double*   alpha, double*   a, int a_rs, int a_cs, double*   x, int incx, double*   beta, double*   y, int incy );
void bl1_csymv( uplo1_t uplo, int m, scomplex* alpha, scomplex* a, int a_rs, int a_cs, scomplex* x, int incx, scomplex* beta, scomplex* y, int incy );
void bl1_zsymv( uplo1_t uplo, int m, dcomplex* alpha, dcomplex* a, int a_rs, int a_cs, dcomplex* x, int incx, dcomplex* beta, dcomplex* y, int incy );

void bl1_ssymv_blas( uplo1_t uplo, int m, float*    alpha, float*    a, int lda, float*    x, int incx, float*    beta, float*    y, int incy );
void bl1_dsymv_blas( uplo1_t uplo, int m, double*   alpha, double*   a, int lda, double*   x, int incx, double*   beta, double*   y, int incy );
void bl1_csymv_blas( uplo1_t uplo, int m, scomplex* alpha, scomplex* a, int lda, scomplex* x, int incx, scomplex* beta, scomplex* y, int incy );
void bl1_zsymv_blas( uplo1_t uplo, int m, dcomplex* alpha, dcomplex* a, int lda, dcomplex* x, int incx, dcomplex* beta, dcomplex* y, int incy );

// --- syr ---
// Symmetric rank-1 update prototypes. alpha has the same element type as x
// and a in every variant.
//  - bl1_?syr:      matrix a described by an (a_rs, a_cs) pair.
//  - bl1_?syr_blas: matrix a described by a single lda.

void bl1_ssyr( uplo1_t uplo, int m, float*    alpha, float*    x, int incx, float*    a, int a_rs, int a_cs );
void bl1_dsyr( uplo1_t uplo, int m, double*   alpha, double*   x, int incx, double*   a, int a_rs, int a_cs );
void bl1_csyr( uplo1_t uplo, int m, scomplex* alpha, scomplex* x, int incx, scomplex* a, int a_rs, int a_cs );
void bl1_zsyr( uplo1_t uplo, int m, dcomplex* alpha, dcomplex* x, int incx, dcomplex* a, int a_rs, int a_cs );

void bl1_ssyr_blas( uplo1_t uplo, int m, float*    alpha, float*    x, int incx, float*    a, int lda );
void bl1_dsyr_blas( uplo1_t uplo, int m, double*   alpha, double*   x, int incx, double*   a, int lda );
void bl1_csyr_blas( uplo1_t uplo, int m, scomplex* alpha, scomplex* x, int incx, scomplex* a, int lda );
void bl1_zsyr_blas( uplo1_t uplo, int m, dcomplex* alpha, dcomplex* x, int incx, dcomplex* a, int lda );

// --- syr2 ---
// Symmetric rank-2 update prototypes: two vectors x and y, a scalar alpha by
// pointer, and a matrix a, all of the same element type.
//  - bl1_?syr2:      matrix a described by an (a_rs, a_cs) pair.
//  - bl1_?syr2_blas: matrix a described by a single lda.

void bl1_ssyr2( uplo1_t uplo, int m, float*    alpha, float*    x, int incx, float*    y, int incy, float*    a, int a_rs, int a_cs );
void bl1_dsyr2( uplo1_t uplo, int m, double*   alpha, double*   x, int incx, double*   y, int incy, double*   a, int a_rs, int a_cs );
void bl1_csyr2( uplo1_t uplo, int m, scomplex* alpha, scomplex* x, int incx, scomplex* y, int incy, scomplex* a, int a_rs, int a_cs );
void bl1_zsyr2( uplo1_t uplo, int m, dcomplex* alpha, dcomplex* x, int incx, dcomplex* y, int incy, dcomplex* a, int a_rs, int a_cs );

void bl1_ssyr2_blas( uplo1_t uplo, int m, float*    alpha, float*    x, int incx, float*    y, int incy, float*    a, int lda );
void bl1_dsyr2_blas( uplo1_t uplo, int m, double*   alpha, double*   x, int incx, double*   y, int incy, double*   a, int lda );
void bl1_csyr2_blas( uplo1_t uplo, int m, scomplex* alpha, scomplex* x, int incx, scomplex* y, int incy, scomplex* a, int lda );
void bl1_zsyr2_blas( uplo1_t uplo, int m, dcomplex* alpha, dcomplex* x, int incx, dcomplex* y, int incy, dcomplex* a, int lda );

// --- trmv ---
// Triangular matrix-vector prototypes. Each takes uplo, trans and diag
// selectors, a single size m, a matrix a, and a vector x (the only non-matrix
// operand, so x is presumably updated in place -- confirm).
//  - bl1_?trmv:      matrix a described by an (a_rs, a_cs) pair.
//  - bl1_?trmv_blas: matrix a described by a single lda.

void bl1_strmv( uplo1_t uplo, trans1_t trans, diag1_t diag, int m, float*    a, int a_rs, int a_cs, float*    x, int incx );
void bl1_dtrmv( uplo1_t uplo, trans1_t trans, diag1_t diag, int m, double*   a, int a_rs, int a_cs, double*   x, int incx );
void bl1_ctrmv( uplo1_t uplo, trans1_t trans, diag1_t diag, int m, scomplex* a, int a_rs, int a_cs, scomplex* x, int incx );
void bl1_ztrmv( uplo1_t uplo, trans1_t trans, diag1_t diag, int m, dcomplex* a, int a_rs, int a_cs, dcomplex* x, int incx );

void bl1_strmv_blas( uplo1_t uplo, trans1_t trans, diag1_t diag, int m, float*    a, int lda, float*    x, int incx );
void bl1_dtrmv_blas( uplo1_t uplo, trans1_t trans, diag1_t diag, int m, double*   a, int lda, double*   x, int incx );
void bl1_ctrmv_blas( uplo1_t uplo, trans1_t trans, diag1_t diag, int m, scomplex* a, int lda, scomplex* x, int incx );
void bl1_ztrmv_blas( uplo1_t uplo, trans1_t trans, diag1_t diag, int m, dcomplex* a, int lda, dcomplex* x, int incx );

// --- trsv ---
// Triangular solve (vector) prototypes, with the same parameter lists as the
// trmv family: uplo, trans and diag selectors, a single size m, a matrix a,
// and a vector x (presumably overwritten with the solution -- confirm).
//  - bl1_?trsv:      matrix a described by an (a_rs, a_cs) pair.
//  - bl1_?trsv_blas: matrix a described by a single lda.

void bl1_strsv( uplo1_t uplo, trans1_t trans, diag1_t diag, int m, float*    a, int a_rs, int a_cs, float*    x, int incx );
void bl1_dtrsv( uplo1_t uplo, trans1_t trans, diag1_t diag, int m, double*   a, int a_rs, int a_cs, double*   x, int incx );
void bl1_ctrsv( uplo1_t uplo, trans1_t trans, diag1_t diag, int m, scomplex* a, int a_rs, int a_cs, scomplex* x, int incx );
void bl1_ztrsv( uplo1_t uplo, trans1_t trans, diag1_t diag, int m, dcomplex* a, int a_rs, int a_cs, dcomplex* x, int incx );

void bl1_strsv_blas( uplo1_t uplo, trans1_t trans, diag1_t diag, int m, float*    a, int lda, float*    x, int incx );
void bl1_dtrsv_blas( uplo1_t uplo, trans1_t trans, diag1_t diag, int m, double*   a, int lda, double*   x, int incx );
void bl1_ctrsv_blas( uplo1_t uplo, trans1_t trans, diag1_t diag, int m, scomplex* a, int lda, scomplex* x, int incx );
void bl1_ztrsv_blas( uplo1_t uplo, trans1_t trans, diag1_t diag, int m, dcomplex* a, int lda, dcomplex* x, int incx );

// --- trmvsx ---
// Extended triangular matrix-vector prototypes: in addition to the trmv
// operands (uplo, trans, diag, m, a, x) they take scalars alpha and beta by
// pointer and a separate vector y with its own increment.
// NOTE(review): presumably y := beta * y + alpha * op(a) * x -- confirm
// against the implementation.

void bl1_strmvsx( uplo1_t uplo, trans1_t trans, diag1_t diag, int m, float* alpha, float* a, int a_rs, int a_cs, float* x, int incx, float* beta, float* y, int incy );
void bl1_dtrmvsx( uplo1_t uplo, trans1_t trans, diag1_t diag, int m, double* alpha, double* a, int a_rs, int a_cs, double* x, int incx, double* beta, double* y, int incy );
void bl1_ctrmvsx( uplo1_t uplo, trans1_t trans, diag1_t diag, int m, scomplex* alpha, scomplex* a, int a_rs, int a_cs, scomplex* x, int incx, scomplex* beta, scomplex* y, int incy );
void bl1_ztrmvsx( uplo1_t uplo, trans1_t trans, diag1_t diag, int m, dcomplex* alpha, dcomplex* a, int a_rs, int a_cs, dcomplex* x, int incx, dcomplex* beta, dcomplex* y, int incy );

// --- trsvsx ---
// Extended triangular solve (vector) prototypes: in addition to the trsv
// operands (uplo, trans, diag, m, a, x) they take scalars alpha and beta by
// pointer and a separate vector y with its own increment.
// NOTE(review): the exact update applied to y is not visible from the
// prototypes; confirm against the implementation.

void bl1_strsvsx( uplo1_t uplo, trans1_t trans, diag1_t diag, int m, float* alpha, float* a, int a_rs, int a_cs, float* x, int incx, float* beta, float* y, int incy );
void bl1_dtrsvsx( uplo1_t uplo, trans1_t trans, diag1_t diag, int m, double* alpha, double* a, int a_rs, int a_cs, double* x, int incx, double* beta, double* y, int incy );
void bl1_ctrsvsx( uplo1_t uplo, trans1_t trans, diag1_t diag, int m, scomplex* alpha, scomplex* a, int a_rs, int a_cs, scomplex* x, int incx, scomplex* beta, scomplex* y, int incy );
void bl1_ztrsvsx( uplo1_t uplo, trans1_t trans, diag1_t diag, int m, dcomplex* alpha, dcomplex* a, int a_rs, int a_cs, dcomplex* x, int incx, dcomplex* beta, dcomplex* y, int incy );

// end blis_prototypes_level2.h
// begin blis_prototypes_level3.h


// --- Level-3 BLAS-like prototypes --------------------------------------------

// --- gemm ---
// General matrix-matrix prototypes in two flavors:
//  - bl1_?gemm:      matrices a, b, c each described by an (rs, cs) pair.
//  - bl1_?gemm_blas: matrices described by lda, ldb, ldc.
// CAUTION: the size arguments are ordered differently between the two
// flavors: bl1_?gemm takes (m, k, n) while bl1_?gemm_blas takes (m, n, k).

void bl1_sgemm( trans1_t transa, trans1_t transb, int m, int k, int n, float*    alpha, float*    a, int a_rs, int a_cs, float*    b, int b_rs, int b_cs, float*    beta, float*    c, int c_rs, int c_cs );
void bl1_dgemm( trans1_t transa, trans1_t transb, int m, int k, int n, double*   alpha, double*   a, int a_rs, int a_cs, double*   b, int b_rs, int b_cs, double*   beta, double*   c, int c_rs, int c_cs );
void bl1_cgemm( trans1_t transa, trans1_t transb, int m, int k, int n, scomplex* alpha, scomplex* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs, scomplex* beta, scomplex* c, int c_rs, int c_cs );
void bl1_zgemm( trans1_t transa, trans1_t transb, int m, int k, int n, dcomplex* alpha, dcomplex* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs, dcomplex* beta, dcomplex* c, int c_rs, int c_cs );

void bl1_sgemm_blas( trans1_t transa, trans1_t transb, int m, int n, int k, float*    alpha, float*    a, int lda, float*    b, int ldb, float*    beta, float*    c, int ldc );
void bl1_dgemm_blas( trans1_t transa, trans1_t transb, int m, int n, int k, double*   alpha, double*   a, int lda, double*   b, int ldb, double*   beta, double*   c, int ldc );
void bl1_cgemm_blas( trans1_t transa, trans1_t transb, int m, int n, int k, scomplex* alpha, scomplex* a, int lda, scomplex* b, int ldb, scomplex* beta, scomplex* c, int ldc );
void bl1_zgemm_blas( trans1_t transa, trans1_t transb, int m, int n, int k, dcomplex* alpha, dcomplex* a, int lda, dcomplex* b, int ldb, dcomplex* beta, dcomplex* c, int ldc );

// --- hemm ---
// Hermitian matrix-matrix prototypes taking side and uplo selectors, sizes m
// and n, scalars alpha and beta by pointer, and matrices a, b, c.
//  - bl1_?hemm: declared for all four element types, (rs, cs) pairs.
//  - *_blas:    declared for the complex types (c, z) only, with lda/ldb/ldc.

void bl1_shemm( side1_t side, uplo1_t uplo, int m, int n, float*    alpha, float*    a, int a_rs, int a_cs, float*    b, int b_rs, int b_cs, float*    beta, float*    c, int c_rs, int c_cs );
void bl1_dhemm( side1_t side, uplo1_t uplo, int m, int n, double*   alpha, double*   a, int a_rs, int a_cs, double*   b, int b_rs, int b_cs, double*   beta, double*   c, int c_rs, int c_cs );
void bl1_chemm( side1_t side, uplo1_t uplo, int m, int n, scomplex* alpha, scomplex* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs, scomplex* beta, scomplex* c, int c_rs, int c_cs );
void bl1_zhemm( side1_t side, uplo1_t uplo, int m, int n, dcomplex* alpha, dcomplex* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs, dcomplex* beta, dcomplex* c, int c_rs, int c_cs );

void bl1_chemm_blas( side1_t side, uplo1_t uplo, int m, int n, scomplex* alpha, scomplex* a, int lda, scomplex* b, int ldb, scomplex* beta, scomplex* c, int ldc );
void bl1_zhemm_blas( side1_t side, uplo1_t uplo, int m, int n, dcomplex* alpha, dcomplex* a, int lda, dcomplex* b, int ldb, dcomplex* beta, dcomplex* c, int ldc );

// --- herk ---
// Hermitian rank-k update prototypes. Both alpha and beta are pointers to a
// real type (float* for s/c, double* for d/z), even when a and c are complex.
//  - bl1_?herk: declared for all four element types, (rs, cs) pairs.
//  - *_blas:    declared for the complex types (c, z) only, with lda/ldc.

void bl1_sherk( uplo1_t uplo, trans1_t trans, int m, int k, float*  alpha, float*    a, int a_rs, int a_cs, float*  beta, float*    c, int c_rs, int c_cs );
void bl1_dherk( uplo1_t uplo, trans1_t trans, int m, int k, double* alpha, double*   a, int a_rs, int a_cs, double* beta, double*   c, int c_rs, int c_cs );
void bl1_cherk( uplo1_t uplo, trans1_t trans, int m, int k, float*  alpha, scomplex* a, int a_rs, int a_cs, float*  beta, scomplex* c, int c_rs, int c_cs );
void bl1_zherk( uplo1_t uplo, trans1_t trans, int m, int k, double* alpha, dcomplex* a, int a_rs, int a_cs, double* beta, dcomplex* c, int c_rs, int c_cs );

void bl1_cherk_blas( uplo1_t uplo, trans1_t trans, int m, int k, float*  alpha, scomplex* a, int lda, float*  beta, scomplex* c, int ldc );
void bl1_zherk_blas( uplo1_t uplo, trans1_t trans, int m, int k, double* alpha, dcomplex* a, int lda, double* beta, dcomplex* c, int ldc );

// --- her2k ---
// Hermitian rank-2k update prototypes. alpha has the same element type as
// the matrices (complex for c/z), whereas beta is always a pointer to a real
// type (float* for s/c, double* for d/z).
//  - bl1_?her2k: declared for all four element types, (rs, cs) pairs.
//  - *_blas:     declared for the complex types (c, z) only, with
//                lda/ldb/ldc.

void bl1_sher2k( uplo1_t uplo, trans1_t trans, int m, int k, float*    alpha, float*    a, int a_rs, int a_cs, float*    b, int b_rs, int b_cs, float*  beta, float*    c, int c_rs, int c_cs );
void bl1_dher2k( uplo1_t uplo, trans1_t trans, int m, int k, double*   alpha, double*   a, int a_rs, int a_cs, double*   b, int b_rs, int b_cs, double* beta, double*   c, int c_rs, int c_cs );
void bl1_cher2k( uplo1_t uplo, trans1_t trans, int m, int k, scomplex* alpha, scomplex* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs, float*  beta, scomplex* c, int c_rs, int c_cs );
void bl1_zher2k( uplo1_t uplo, trans1_t trans, int m, int k, dcomplex* alpha, dcomplex* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs, double* beta, dcomplex* c, int c_rs, int c_cs );

void bl1_cher2k_blas( uplo1_t uplo, trans1_t trans, int m, int k, scomplex* alpha, scomplex* a, int lda, scomplex* b, int ldb, float*  beta, scomplex* c, int ldc );
void bl1_zher2k_blas( uplo1_t uplo, trans1_t trans, int m, int k, dcomplex* alpha, dcomplex* a, int lda, dcomplex* b, int ldb, double* beta, dcomplex* c, int ldc );

// --- symm ---
// Symmetric matrix-matrix prototypes taking side and uplo selectors, sizes m
// and n, scalars alpha and beta by pointer, and matrices a, b, c, all of the
// same element type.
//  - bl1_?symm:      matrices described by (rs, cs) pairs.
//  - bl1_?symm_blas: matrices described by lda/ldb/ldc.
// Both flavors are declared for all four element types (s, d, c, z).

void bl1_ssymm( side1_t side, uplo1_t uplo, int m, int n, float*    alpha, float*    a, int a_rs, int a_cs, float*    b, int b_rs, int b_cs, float*    beta, float*    c, int c_rs, int c_cs );
void bl1_dsymm( side1_t side, uplo1_t uplo, int m, int n, double*   alpha, double*   a, int a_rs, int a_cs, double*   b, int b_rs, int b_cs, double*   beta, double*   c, int c_rs, int c_cs );
void bl1_csymm( side1_t side, uplo1_t uplo, int m, int n, scomplex* alpha, scomplex* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs, scomplex* beta, scomplex* c, int c_rs, int c_cs );
void bl1_zsymm( side1_t side, uplo1_t uplo, int m, int n, dcomplex* alpha, dcomplex* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs, dcomplex* beta, dcomplex* c, int c_rs, int c_cs );

void bl1_ssymm_blas( side1_t side, uplo1_t uplo, int m, int n, float*    alpha, float*    a, int lda, float*    b, int ldb, float*    beta, float*    c, int ldc );
void bl1_dsymm_blas( side1_t side, uplo1_t uplo, int m, int n, double*   alpha, double*   a, int lda, double*   b, int ldb, double*   beta, double*   c, int ldc );
void bl1_csymm_blas( side1_t side, uplo1_t uplo, int m, int n, scomplex* alpha, scomplex* a, int lda, scomplex* b, int ldb, scomplex* beta, scomplex* c, int ldc );
void bl1_zsymm_blas( side1_t side, uplo1_t uplo, int m, int n, dcomplex* alpha, dcomplex* a, int lda, dcomplex* b, int ldb, dcomplex* beta, dcomplex* c, int ldc );

// --- syrk ---

// Strided-interface syrk prototypes, one per datatype (s/d/c/z). Takes one
// input matrix a and one in/out-style matrix c, each with an (*_rs, *_cs)
// int pair; alpha and beta share the matrix datatype.
// NOTE(review): semantics presumably follow BLAS ?syrk -- confirm.
void bl1_ssyrk( uplo1_t uplo, trans1_t trans, int m, int k, float*    alpha, float*    a, int a_rs, int a_cs, float*    beta, float*    c, int c_rs, int c_cs );
void bl1_dsyrk( uplo1_t uplo, trans1_t trans, int m, int k, double*   alpha, double*   a, int a_rs, int a_cs, double*   beta, double*   c, int c_rs, int c_cs );
void bl1_csyrk( uplo1_t uplo, trans1_t trans, int m, int k, scomplex* alpha, scomplex* a, int a_rs, int a_cs, scomplex* beta, scomplex* c, int c_rs, int c_cs );
void bl1_zsyrk( uplo1_t uplo, trans1_t trans, int m, int k, dcomplex* alpha, dcomplex* a, int a_rs, int a_cs, dcomplex* beta, dcomplex* c, int c_rs, int c_cs );

// Same argument order, with a single leading dimension (lda/ldc) per matrix.
void bl1_ssyrk_blas( uplo1_t uplo, trans1_t trans, int m, int k, float*    alpha, float*    a, int lda, float*    beta, float*    c, int ldc );
void bl1_dsyrk_blas( uplo1_t uplo, trans1_t trans, int m, int k, double*   alpha, double*   a, int lda, double*   beta, double*   c, int ldc );
void bl1_csyrk_blas( uplo1_t uplo, trans1_t trans, int m, int k, scomplex* alpha, scomplex* a, int lda, scomplex* beta, scomplex* c, int ldc );
void bl1_zsyrk_blas( uplo1_t uplo, trans1_t trans, int m, int k, dcomplex* alpha, dcomplex* a, int lda, dcomplex* beta, dcomplex* c, int ldc );

// --- syr2k ---

// Strided-interface syr2k prototypes, one per datatype (s/d/c/z). Takes two
// input matrices (a, b) and a third matrix c, each with an (*_rs, *_cs) int
// pair. Unlike the her2k prototypes, beta here has the same datatype as the
// matrices in every variant.
// NOTE(review): semantics presumably follow BLAS ?syr2k -- confirm.
void bl1_ssyr2k( uplo1_t uplo, trans1_t trans, int m, int k, float*    alpha, float*    a, int a_rs, int a_cs, float*    b, int b_rs, int b_cs, float*    beta, float*    c, int c_rs, int c_cs );
void bl1_dsyr2k( uplo1_t uplo, trans1_t trans, int m, int k, double*   alpha, double*   a, int a_rs, int a_cs, double*   b, int b_rs, int b_cs, double*   beta, double*   c, int c_rs, int c_cs );
void bl1_csyr2k( uplo1_t uplo, trans1_t trans, int m, int k, scomplex* alpha, scomplex* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs, scomplex* beta, scomplex* c, int c_rs, int c_cs );
void bl1_zsyr2k( uplo1_t uplo, trans1_t trans, int m, int k, dcomplex* alpha, dcomplex* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs, dcomplex* beta, dcomplex* c, int c_rs, int c_cs );

// Same argument order, with a single leading dimension (lda/ldb/ldc) per
// matrix.
void bl1_ssyr2k_blas( uplo1_t uplo, trans1_t trans, int m, int k, float*    alpha, float*    a, int lda, float*    b, int ldb, float*    beta, float*    c, int ldc );
void bl1_dsyr2k_blas( uplo1_t uplo, trans1_t trans, int m, int k, double*   alpha, double*   a, int lda, double*   b, int ldb, double*   beta, double*   c, int ldc );
void bl1_csyr2k_blas( uplo1_t uplo, trans1_t trans, int m, int k, scomplex* alpha, scomplex* a, int lda, scomplex* b, int ldb, scomplex* beta, scomplex* c, int ldc );
void bl1_zsyr2k_blas( uplo1_t uplo, trans1_t trans, int m, int k, dcomplex* alpha, dcomplex* a, int lda, dcomplex* b, int ldb, dcomplex* beta, dcomplex* c, int ldc );

// --- trmm ---

// Strided-interface trmm prototypes, one per datatype (s/d/c/z). Only two
// matrices (a, b) and one scalar (alpha) are passed; there is no beta and no
// separate output matrix.
// NOTE(review): presumably b is updated in place as in BLAS ?trmm -- confirm.
void bl1_strmm( side1_t side, uplo1_t uplo, trans1_t trans, diag1_t diag, int m, int n, float*    alpha, float*    a, int a_rs, int a_cs, float*    b, int b_rs, int b_cs );
void bl1_dtrmm( side1_t side, uplo1_t uplo, trans1_t trans, diag1_t diag, int m, int n, double*   alpha, double*   a, int a_rs, int a_cs, double*   b, int b_rs, int b_cs );
void bl1_ctrmm( side1_t side, uplo1_t uplo, trans1_t trans, diag1_t diag, int m, int n, scomplex* alpha, scomplex* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs );
void bl1_ztrmm( side1_t side, uplo1_t uplo, trans1_t trans, diag1_t diag, int m, int n, dcomplex* alpha, dcomplex* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs );

// Same argument order, with a single leading dimension (lda/ldb) per matrix.
void bl1_strmm_blas( side1_t side, uplo1_t uplo, trans1_t trans, diag1_t diag, int m, int n, float*    alpha, float*    a, int lda, float*    b, int ldb );
void bl1_dtrmm_blas( side1_t side, uplo1_t uplo, trans1_t trans, diag1_t diag, int m, int n, double*   alpha, double*   a, int lda, double*   b, int ldb );
void bl1_ctrmm_blas( side1_t side, uplo1_t uplo, trans1_t trans, diag1_t diag, int m, int n, scomplex* alpha, scomplex* a, int lda, scomplex* b, int ldb );
void bl1_ztrmm_blas( side1_t side, uplo1_t uplo, trans1_t trans, diag1_t diag, int m, int n, dcomplex* alpha, dcomplex* a, int lda, dcomplex* b, int ldb );

// --- trsm ---

// Strided-interface trsm prototypes, one per datatype (s/d/c/z). The argument
// list is identical in shape to the trmm prototypes: side/uplo/trans/diag
// selectors, dimensions m and n, scalar alpha, and matrices a and b.
// NOTE(review): presumably b is overwritten with the solution as in BLAS
// ?trsm -- confirm.
void bl1_strsm( side1_t side, uplo1_t uplo, trans1_t trans, diag1_t diag, int m, int n, float*    alpha, float*    a, int a_rs, int a_cs, float*    b, int b_rs, int b_cs );
void bl1_dtrsm( side1_t side, uplo1_t uplo, trans1_t trans, diag1_t diag, int m, int n, double*   alpha, double*   a, int a_rs, int a_cs, double*   b, int b_rs, int b_cs );
void bl1_ctrsm( side1_t side, uplo1_t uplo, trans1_t trans, diag1_t diag, int m, int n, scomplex* alpha, scomplex* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs );
void bl1_ztrsm( side1_t side, uplo1_t uplo, trans1_t trans, diag1_t diag, int m, int n, dcomplex* alpha, dcomplex* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs );

// Same argument order, with a single leading dimension (lda/ldb) per matrix.
void bl1_strsm_blas( side1_t side, uplo1_t uplo, trans1_t trans, diag1_t diag, int m, int n, float*    alpha, float*    a, int lda, float*    b, int ldb );
void bl1_dtrsm_blas( side1_t side, uplo1_t uplo, trans1_t trans, diag1_t diag, int m, int n, double*   alpha, double*   a, int lda, double*   b, int ldb );
void bl1_ctrsm_blas( side1_t side, uplo1_t uplo, trans1_t trans, diag1_t diag, int m, int n, scomplex* alpha, scomplex* a, int lda, scomplex* b, int ldb );
void bl1_ztrsm_blas( side1_t side, uplo1_t uplo, trans1_t trans, diag1_t diag, int m, int n, dcomplex* alpha, dcomplex* a, int lda, dcomplex* b, int ldb );

// --- trmmsx ---

// trmmsx prototypes, one per datatype (s/d/c/z). The leading arguments match
// bl1_?trmm; a second scalar (beta) and a third strided matrix (c) are
// appended. No leading-dimension (_blas) form is declared for this operation
// in this section.
// NOTE(review): the exact update applied to c is not visible from the
// prototype -- see the implementation.
void bl1_strmmsx( side1_t side, uplo1_t uplo, trans1_t trans, diag1_t diag, int m, int n, float*    alpha, float*    a, int a_rs, int a_cs, float*    b, int b_rs, int b_cs, float*    beta, float*    c, int c_rs, int c_cs );
void bl1_dtrmmsx( side1_t side, uplo1_t uplo, trans1_t trans, diag1_t diag, int m, int n, double*   alpha, double*   a, int a_rs, int a_cs, double*   b, int b_rs, int b_cs, double*   beta, double*   c, int c_rs, int c_cs );
void bl1_ctrmmsx( side1_t side, uplo1_t uplo, trans1_t trans, diag1_t diag, int m, int n, scomplex* alpha, scomplex* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs, scomplex* beta, scomplex* c, int c_rs, int c_cs );
void bl1_ztrmmsx( side1_t side, uplo1_t uplo, trans1_t trans, diag1_t diag, int m, int n, dcomplex* alpha, dcomplex* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs, dcomplex* beta, dcomplex* c, int c_rs, int c_cs );

// --- trsmsx ---

// trsmsx prototypes, one per datatype (s/d/c/z). The leading arguments match
// bl1_?trsm; a second scalar (beta) and a third strided matrix (c) are
// appended. No leading-dimension (_blas) form is declared for this operation
// in this section.
// NOTE(review): the exact update applied to c is not visible from the
// prototype -- see the implementation.
void bl1_strsmsx( side1_t side, uplo1_t uplo, trans1_t trans, diag1_t diag, int m, int n, float*    alpha, float*    a, int a_rs, int a_cs, float*    b, int b_rs, int b_cs, float*    beta, float*    c, int c_rs, int c_cs );
void bl1_dtrsmsx( side1_t side, uplo1_t uplo, trans1_t trans, diag1_t diag, int m, int n, double*   alpha, double*   a, int a_rs, int a_cs, double*   b, int b_rs, int b_cs, double*   beta, double*   c, int c_rs, int c_cs );
void bl1_ctrsmsx( side1_t side, uplo1_t uplo, trans1_t trans, diag1_t diag, int m, int n, scomplex* alpha, scomplex* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs, scomplex* beta, scomplex* c, int c_rs, int c_cs );
void bl1_ztrsmsx( side1_t side, uplo1_t uplo, trans1_t trans, diag1_t diag, int m, int n, dcomplex* alpha, dcomplex* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs, dcomplex* beta, dcomplex* c, int c_rs, int c_cs );

// end blis_prototypes_level3.h

// begin blis_prototypes_fused1.h


// --- Fused Level-1 BLAS-like prototypes --------------------------------------

// --- axmyv2 ---

// axmyv2 prototypes, one per datatype (s/d/c/z). Arguments: a conjugation
// selector (conjx), length n, two scalars (alpha, beta), and three vectors
// (x, y, z), each followed by its int increment.
// NOTE(review): which vectors are read vs. written is not visible from the
// prototype -- see the implementation.
void bl1_saxmyv2( conj1_t conjx, int n, float*    alpha, float*    beta, float*    x, int inc_x, float*    y, int inc_y, float*    z, int inc_z );
void bl1_daxmyv2( conj1_t conjx, int n, double*   alpha, double*   beta, double*   x, int inc_x, double*   y, int inc_y, double*   z, int inc_z );
void bl1_caxmyv2( conj1_t conjx, int n, scomplex* alpha, scomplex* beta, scomplex* x, int inc_x, scomplex* y, int inc_y, scomplex* z, int inc_z );
void bl1_zaxmyv2( conj1_t conjx, int n, dcomplex* alpha, dcomplex* beta, dcomplex* x, int inc_x, dcomplex* y, int inc_y, dcomplex* z, int inc_z );

// --- axpyv2b ---

// axpyv2b prototypes, one per datatype (s/d/c/z). Arguments: length n, two
// scalars (beta1, beta2), two vectors (a1, a2) and a vector w, each vector
// followed by its int increment. No conjugation selector is taken.
// NOTE(review): presumably w accumulates beta1*a1 + beta2*a2 -- confirm.
void bl1_saxpyv2b( int n, float*    beta1, float*    beta2, float*    a1, int inc_a1, float*    a2, int inc_a2, float*    w, int inc_w );
void bl1_daxpyv2b( int n, double*   beta1, double*   beta2, double*   a1, int inc_a1, double*   a2, int inc_a2, double*   w, int inc_w );
void bl1_caxpyv2b( int n, scomplex* beta1, scomplex* beta2, scomplex* a1, int inc_a1, scomplex* a2, int inc_a2, scomplex* w, int inc_w );
void bl1_zaxpyv2b( int n, dcomplex* beta1, dcomplex* beta2, dcomplex* a1, int inc_a1, dcomplex* a2, int inc_a2, dcomplex* w, int inc_w );

// --- axpyv3b ---

// axpyv3b prototypes, one per datatype (s/d/c/z). Three-term counterpart of
// the axpyv2b signature: length n, three scalars (beta1..beta3), three
// vectors (a1..a3) and a vector w, each vector followed by its int increment.
// NOTE(review): presumably w accumulates the three scaled vectors -- confirm.
void bl1_saxpyv3b( int n, float*    beta1, float*    beta2, float*    beta3, float*    a1, int inc_a1, float*    a2, int inc_a2, float*    a3, int inc_a3, float*    w, int inc_w );
void bl1_daxpyv3b( int n, double*   beta1, double*   beta2, double*   beta3, double*   a1, int inc_a1, double*   a2, int inc_a2, double*   a3, int inc_a3, double*   w, int inc_w );
void bl1_caxpyv3b( int n, scomplex* beta1, scomplex* beta2, scomplex* beta3, scomplex* a1, int inc_a1, scomplex* a2, int inc_a2, scomplex* a3, int inc_a3, scomplex* w, int inc_w );
void bl1_zaxpyv3b( int n, dcomplex* beta1, dcomplex* beta2, dcomplex* beta3, dcomplex* a1, int inc_a1, dcomplex* a2, int inc_a2, dcomplex* a3, int inc_a3, dcomplex* w, int inc_w );

// --- axpyv2bdotaxpy ---

// axpyv2bdotaxpy prototypes, one per datatype (s/d/c/z). Arguments: length n,
// scalar beta with vector u, scalar gamma with vector z, vectors a and x,
// scalars kappa and rho, and vector w; every vector is followed by its int
// increment.
// NOTE(review): the name suggests a fusion of axpyv2b with dotaxpy; which
// operands are outputs is not visible from the prototype -- confirm.
void bl1_saxpyv2bdotaxpy( int n, float*    beta, float*    u, int inc_u, float*    gamma, float*    z, int inc_z, float*    a, int inc_a, float*    x, int inc_x, float*    kappa, float*    rho, float*    w, int inc_w );
void bl1_daxpyv2bdotaxpy( int n, double*   beta, double*   u, int inc_u, double*   gamma, double*   z, int inc_z, double*   a, int inc_a, double*   x, int inc_x, double*   kappa, double*   rho, double*   w, int inc_w );
void bl1_caxpyv2bdotaxpy( int n, scomplex* beta, scomplex* u, int inc_u, scomplex* gamma, scomplex* z, int inc_z, scomplex* a, int inc_a, scomplex* x, int inc_x, scomplex* kappa, scomplex* rho, scomplex* w, int inc_w );
void bl1_zaxpyv2bdotaxpy( int n, dcomplex* beta, dcomplex* u, int inc_u, dcomplex* gamma, dcomplex* z, int inc_z, dcomplex* a, int inc_a, dcomplex* x, int inc_x, dcomplex* kappa, dcomplex* rho, dcomplex* w, int inc_w );

// --- dotsv2 ---

// dotsv2 prototypes, one per datatype (s/d/c/z). Arguments: a conjugation
// selector (conjxy), length n, three vectors (x, y, z) with int increments,
// a scalar beta, and two scalar pointers rho_xz and rho_yz.
// NOTE(review): presumably rho_xz and rho_yz receive the x.z and y.z dot
// products scaled/accumulated with beta -- confirm.
void bl1_sdotsv2( conj1_t conjxy, int n, float*    x, int inc_x, float*    y, int inc_y, float*    z, int inc_z, float*    beta, float*    rho_xz, float*    rho_yz );
void bl1_ddotsv2( conj1_t conjxy, int n, double*   x, int inc_x, double*   y, int inc_y, double*   z, int inc_z, double*   beta, double*   rho_xz, double*   rho_yz );
void bl1_cdotsv2( conj1_t conjxy, int n, scomplex* x, int inc_x, scomplex* y, int inc_y, scomplex* z, int inc_z, scomplex* beta, scomplex* rho_xz, scomplex* rho_yz );
void bl1_zdotsv2( conj1_t conjxy, int n, dcomplex* x, int inc_x, dcomplex* y, int inc_y, dcomplex* z, int inc_z, dcomplex* beta, dcomplex* rho_xz, dcomplex* rho_yz );

// --- dotsv3 ---

// dotsv3 prototypes, one per datatype (s/d/c/z). Three-vector counterpart of
// the dotsv2 signature: a conjugation selector (conjxyw), length n, vectors
// x, y, w and z with int increments, a scalar beta, and three scalar pointers
// rho_xz, rho_yz and rho_wz.
// NOTE(review): presumably each rho_* receives the dot product of the named
// vector with z -- confirm.
void bl1_sdotsv3( conj1_t conjxyw, int n, float*    x, int inc_x, float*    y, int inc_y, float*    w, int inc_w, float*    z, int inc_z, float*    beta, float*    rho_xz, float*    rho_yz, float*    rho_wz );
void bl1_ddotsv3( conj1_t conjxyw, int n, double*   x, int inc_x, double*   y, int inc_y, double*   w, int inc_w, double*   z, int inc_z, double*   beta, double*   rho_xz, double*   rho_yz, double*   rho_wz );
void bl1_cdotsv3( conj1_t conjxyw, int n, scomplex* x, int inc_x, scomplex* y, int inc_y, scomplex* w, int inc_w, scomplex* z, int inc_z, scomplex* beta, scomplex* rho_xz, scomplex* rho_yz, scomplex* rho_wz );
void bl1_zdotsv3( conj1_t conjxyw, int n, dcomplex* x, int inc_x, dcomplex* y, int inc_y, dcomplex* w, int inc_w, dcomplex* z, int inc_z, dcomplex* beta, dcomplex* rho_xz, dcomplex* rho_yz, dcomplex* rho_wz );

// --- dotaxpy ---

// dotaxpy prototypes, one per datatype (s/d/c/z). Arguments: length n,
// vectors a and x with int increments, scalar pointers kappa and rho, and
// vector w with its increment.
// NOTE(review): the name suggests a fused dot product and axpy; which of
// kappa/rho/w are outputs is not visible from the prototype -- confirm.
void bl1_sdotaxpy( int n, float*    a, int inc_a, float*    x, int inc_x, float*    kappa, float*    rho, float*    w, int inc_w );
void bl1_ddotaxpy( int n, double*   a, int inc_a, double*   x, int inc_x, double*   kappa, double*   rho, double*   w, int inc_w );
void bl1_cdotaxpy( int n, scomplex* a, int inc_a, scomplex* x, int inc_x, scomplex* kappa, scomplex* rho, scomplex* w, int inc_w );
void bl1_zdotaxpy( int n, dcomplex* a, int inc_a, dcomplex* x, int inc_x, dcomplex* kappa, dcomplex* rho, dcomplex* w, int inc_w );

// --- dotaxmyv2 ---

// dotaxmyv2 prototypes, one per datatype (s/d/c/z). Arguments: length n, two
// scalars (alpha, beta), vectors x and u with int increments, a scalar
// pointer rho, and vectors y and z with int increments. Unlike bl1_?axmyv2,
// no conjugation selector is taken.
// NOTE(review): the name suggests a dot product fused with axmyv2; operand
// roles are not visible from the prototype -- confirm.
void bl1_sdotaxmyv2( int n, float*    alpha, float*    beta, float*    x, int inc_x, float*    u, int inc_u, float*    rho, float*    y, int inc_y, float*    z, int inc_z );
void bl1_ddotaxmyv2( int n, double*   alpha, double*   beta, double*   x, int inc_x, double*   u, int inc_u, double*   rho, double*   y, int inc_y, double*   z, int inc_z );
void bl1_cdotaxmyv2( int n, scomplex* alpha, scomplex* beta, scomplex* x, int inc_x, scomplex* u, int inc_u, scomplex* rho, scomplex* y, int inc_y, scomplex* z, int inc_z );
void bl1_zdotaxmyv2( int n, dcomplex* alpha, dcomplex* beta, dcomplex* x, int inc_x, dcomplex* u, int inc_u, dcomplex* rho, dcomplex* y, int inc_y, dcomplex* z, int inc_z );

// --- dotv2axpyv2b ---

// dotv2axpyv2b prototypes, one per datatype (s/d/c/z). Arguments: length n,
// vectors a1, a2 and x with int increments, scalar pointers kappa1/kappa2 and
// rho1/rho2, and vector w with its increment.
// NOTE(review): the name suggests two dot products fused with axpyv2b;
// operand roles are not visible from the prototype -- confirm.
void bl1_sdotv2axpyv2b( int n, float*    a1, int inc_a1, float*    a2, int inc_a2, float*    x,  int inc_x, float*    kappa1, float*    kappa2, float*    rho1, float*    rho2, float*    w, int inc_w );
void bl1_ddotv2axpyv2b( int n, double*   a1, int inc_a1, double*   a2, int inc_a2, double*   x,  int inc_x, double*   kappa1, double*   kappa2, double*   rho1, double*   rho2, double*   w, int inc_w );
void bl1_cdotv2axpyv2b( int n, scomplex* a1, int inc_a1, scomplex* a2, int inc_a2, scomplex* x,  int inc_x, scomplex* kappa1, scomplex* kappa2, scomplex* rho1, scomplex* rho2, scomplex* w, int inc_w );
void bl1_zdotv2axpyv2b( int n, dcomplex* a1, int inc_a1, dcomplex* a2, int inc_a2, dcomplex* x,  int inc_x, dcomplex* kappa1, dcomplex* kappa2, dcomplex* rho1, dcomplex* rho2, dcomplex* w, int inc_w );

// --- axpyv2bdots ---

// axpyv2bdots prototype. Only the dcomplex (z) variant is declared in this
// section. Arguments: length n, two scalars (alpha1, alpha2), vectors x1, x2,
// y and u with int increments, and scalar pointers beta and rho.
// NOTE(review): the name suggests axpyv2b fused with a dots operation;
// operand roles are not visible from the prototype -- confirm.
void bl1_zaxpyv2bdots( int       n,
                       dcomplex* alpha1,
                       dcomplex* alpha2,
                       dcomplex* x1, int inc_x1,
                       dcomplex* x2, int inc_x2,
                       dcomplex* y,  int inc_y,
                       dcomplex* u,  int inc_u,
                       dcomplex* beta,
                       dcomplex* rho );
// end blis_prototypes_fused1.h

// begin blis_f77_name_mangling.h


// --- Define Fortran name-mangling macro --------------------------------------

// If the F77_FUNC name-mangling macro is undefined, then we need to define
// it ourselves.
#if !defined( F77_FUNC )

  // Fallback definition of F77_FUNC, used only when nothing earlier (e.g. a
  // configure-generated header) has provided one. Two build switches pick the
  // mangling scheme:
  //
  //   BLIS1_ENABLE_WINDOWS_BUILD  -> no trailing underscore
  //   BLIS1_ENABLE_UPPERCASE_F77  -> use the uppercase spelling
  //
  // When neither is defined the result is the lowercase name followed by a
  // single underscore.

  #if defined( BLIS1_ENABLE_WINDOWS_BUILD ) && defined( BLIS1_ENABLE_UPPERCASE_F77 )

    // Windows, uppercase: NAME
    #define F77_FUNC( lc_name, uc_name ) uc_name

  #elif defined( BLIS1_ENABLE_WINDOWS_BUILD )

    // Windows, lowercase: name
    #define F77_FUNC( lc_name, uc_name ) lc_name

  #elif defined( BLIS1_ENABLE_UPPERCASE_F77 )

    // Linux-like, uppercase: NAME_
    #define F77_FUNC( lc_name, uc_name ) uc_name ## _

  #else

    // Linux-like, lowercase: name_
    #define F77_FUNC( lc_name, uc_name ) lc_name ## _

  #endif

#endif // !defined( F77_FUNC )

// end blis_f77_name_mangling.h

#ifdef BLIS1_ENABLE_CBLAS_INTERFACES
// begin blis_prototypes_cblas.h


#include <stddef.h> // skipped


// Index type used as the return type of the cblas_i?amax prototypes.
#define CBLAS_INDEX size_t

// Storage order of matrix arguments.
enum CBLAS_ORDER
{
  CblasRowMajor = 101,
  CblasColMajor = 102
};

// Transposition applied to a matrix argument.
enum CBLAS_TRANSPOSE
{
  CblasNoTrans   = 111,
  CblasTrans     = 112,
  CblasConjTrans = 113
};

// Which triangle of a matrix argument is referenced.
enum CBLAS_UPLO
{
  CblasUpper = 121,
  CblasLower = 122
};

// Whether a triangular matrix has a unit diagonal.
enum CBLAS_DIAG
{
  CblasNonUnit = 131,
  CblasUnit    = 132
};

// Side on which a matrix argument is applied.
enum CBLAS_SIDE
{
  CblasLeft  = 141,
  CblasRight = 142
};


// Dot-product prototypes. The real-valued routines return their result
// directly; cblas_sdsdot additionally takes a float scalar (alpha) by value,
// and cblas_dsdot takes float vectors but returns double.
float  cblas_sdsdot(const int N, const float alpha, const float *X,
                    const int incX, const float *Y, const int incY);
double cblas_dsdot(const int N, const float *X, const int incX, const float *Y,
                   const int incY);
float  cblas_sdot(const int N, const float  *X, const int incX,
                  const float  *Y, const int incY);
double cblas_ddot(const int N, const double *X, const int incX,
                  const double *Y, const int incY);


// Complex dot products return void; the result is delivered through the
// trailing non-const void pointer (dotu/dotc). Complex vectors are passed as
// const void pointers.
void   cblas_cdotu_sub(const int N, const void *X, const int incX,
                       const void *Y, const int incY, void *dotu);
void   cblas_cdotc_sub(const int N, const void *X, const int incX,
                       const void *Y, const int incY, void *dotc);

void   cblas_zdotu_sub(const int N, const void *X, const int incX,
                       const void *Y, const int incY, void *dotu);
void   cblas_zdotc_sub(const int N, const void *X, const int incX,
                       const void *Y, const int incY, void *dotc);



// nrm2/asum prototypes. Each takes a length N, a read-only vector X and its
// increment, and returns a real scalar. The complex-input forms (scnrm2,
// scasum, dznrm2, dzasum) take X as const void* and still return float or
// double respectively.
float  cblas_snrm2(const int N, const float *X, const int incX);
float  cblas_sasum(const int N, const float *X, const int incX);

double cblas_dnrm2(const int N, const double *X, const int incX);
double cblas_dasum(const int N, const double *X, const int incX);

float  cblas_scnrm2(const int N, const void *X, const int incX);
float  cblas_scasum(const int N, const void *X, const int incX);

double cblas_dznrm2(const int N, const void *X, const int incX);
double cblas_dzasum(const int N, const void *X, const int incX);



// i?amax prototypes, one per datatype. Each returns a CBLAS_INDEX computed
// from a read-only vector X of length N with increment incX; the complex
// forms take X as const void*.
// NOTE(review): per CBLAS convention the returned index is presumably
// zero-based -- confirm against the implementation.
CBLAS_INDEX cblas_isamax(const int N, const float  *X, const int incX);
CBLAS_INDEX cblas_idamax(const int N, const double *X, const int incX);
CBLAS_INDEX cblas_icamax(const int N, const void   *X, const int incX);
CBLAS_INDEX cblas_izamax(const int N, const void   *X, const int incX);




// swap/copy/axpy prototypes for the four datatypes. Const-qualification shows
// the data flow: swap takes both X and Y as writable, while copy and axpy
// take X as read-only and Y as writable. In the real forms the axpy scalar
// (alpha) is passed by value; in the complex forms it is passed as
// const void*, as are the complex vectors.
void cblas_sswap(const int N, float *X, const int incX, 
                 float *Y, const int incY);
void cblas_scopy(const int N, const float *X, const int incX, 
                 float *Y, const int incY);
void cblas_saxpy(const int N, const float alpha, const float *X,
                 const int incX, float *Y, const int incY);

void cblas_dswap(const int N, double *X, const int incX, 
                 double *Y, const int incY);
void cblas_dcopy(const int N, const double *X, const int incX, 
                 double *Y, const int incY);
void cblas_daxpy(const int N, const double alpha, const double *X,
                 const int incX, double *Y, const int incY);

void cblas_cswap(const int N, void *X, const int incX, 
                 void *Y, const int incY);
void cblas_ccopy(const int N, const void *X, const int incX, 
                 void *Y, const int incY);
void cblas_caxpy(const int N, const void *alpha, const void *X,
                 const int incX, void *Y, const int incY);

void cblas_zswap(const int N, void *X, const int incX, 
                 void *Y, const int incY);
void cblas_zcopy(const int N, const void *X, const int incX, 
                 void *Y, const int incY);
void cblas_zaxpy(const int N, const void *alpha, const void *X,
                 const int incX, void *Y, const int incY);



// Rotation prototypes (rotg, rotmg, rot, rotm), declared for the real
// datatypes only. rotg and rotmg operate purely on scalar pointers; rot and
// rotm take writable vectors X and Y plus either by-value scalars (c, s) or
// a read-only parameter array P.
// NOTE(review): the required length of P is not visible from the prototype;
// the CBLAS reference uses a 5-element array -- confirm.
void cblas_srotg(float *a, float *b, float *c, float *s);
void cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P);
void cblas_srot(const int N, float *X, const int incX,
                float *Y, const int incY, const float c, const float s);
void cblas_srotm(const int N, float *X, const int incX,
                float *Y, const int incY, const float *P);

void cblas_drotg(double *a, double *b, double *c, double *s);
void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P);
void cblas_drot(const int N, double *X, const int incX,
                double *Y, const int incY, const double c, const double s);
void cblas_drotm(const int N, double *X, const int incX,
                double *Y, const int incY, const double *P);



// scal prototypes. X is writable in every form. sscal/dscal take a real
// scalar by value; cscal/zscal take the scalar as const void*; csscal/zdscal
// take a real scalar by value but a void* (complex) vector.
void cblas_sscal(const int N, const float alpha, float *X, const int incX);
void cblas_dscal(const int N, const double alpha, double *X, const int incX);
void cblas_cscal(const int N, const void *alpha, void *X, const int incX);
void cblas_zscal(const int N, const void *alpha, void *X, const int incX);
void cblas_csscal(const int N, const float alpha, void *X, const int incX);
void cblas_zdscal(const int N, const double alpha, void *X, const int incX);




// Single-precision real Level 2 prototypes: gemv/gbmv (which take alpha,
// beta and a separate writable Y) followed by the triangular routines
// trmv/tbmv/tpmv and trsv/tbsv/tpsv (which take no scalars and a single
// writable X). The tp* forms take a matrix pointer Ap with no lda argument;
// the *b* forms take extra int band parameters (KL/KU or K).
// NOTE(review): Ap is presumably packed storage per CBLAS naming -- confirm.
void cblas_sgemv(const enum CBLAS_ORDER order,
                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
                 const float alpha, const float *A, const int lda,
                 const float *X, const int incX, const float beta,
                 float *Y, const int incY);
void cblas_sgbmv(const enum CBLAS_ORDER order,
                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
                 const int KL, const int KU, const float alpha,
                 const float *A, const int lda, const float *X,
                 const int incX, const float beta, float *Y, const int incY);
void cblas_strmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
                 const int N, const float *A, const int lda, 
                 float *X, const int incX);
void cblas_stbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
                 const int N, const int K, const float *A, const int lda, 
                 float *X, const int incX);
void cblas_stpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
                 const int N, const float *Ap, float *X, const int incX);
void cblas_strsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
                 const int N, const float *A, const int lda, float *X,
                 const int incX);
void cblas_stbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
                 const int N, const int K, const float *A, const int lda,
                 float *X, const int incX);
void cblas_stpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
                 const int N, const float *Ap, float *X, const int incX);

// Double-precision real Level 2 prototypes. Signatures mirror the
// single-precision real set with float replaced by double: gemv/gbmv take
// by-value alpha/beta and a writable Y; the triangular routines
// (trmv/tbmv/tpmv, trsv/tbsv/tpsv) take a read-only matrix and a single
// writable X.
void cblas_dgemv(const enum CBLAS_ORDER order,
                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
                 const double alpha, const double *A, const int lda,
                 const double *X, const int incX, const double beta,
                 double *Y, const int incY);
void cblas_dgbmv(const enum CBLAS_ORDER order,
                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
                 const int KL, const int KU, const double alpha,
                 const double *A, const int lda, const double *X,
                 const int incX, const double beta, double *Y, const int incY);
void cblas_dtrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
                 const int N, const double *A, const int lda, 
                 double *X, const int incX);
void cblas_dtbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
                 const int N, const int K, const double *A, const int lda, 
                 double *X, const int incX);
void cblas_dtpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
                 const int N, const double *Ap, double *X, const int incX);
void cblas_dtrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
                 const int N, const double *A, const int lda, double *X,
                 const int incX);
void cblas_dtbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
                 const int N, const int K, const double *A, const int lda,
                 double *X, const int incX);
void cblas_dtpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
                 const int N, const double *Ap, double *X, const int incX);

// Single-precision complex Level 2 prototypes. Same routine set and argument
// order as the real forms, but all complex data -- matrices, vectors, and the
// alpha/beta scalars of gemv/gbmv -- are passed through void pointers
// (const void* for inputs, void* for the updated vector).
void cblas_cgemv(const enum CBLAS_ORDER order,
                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
                 const void *alpha, const void *A, const int lda,
                 const void *X, const int incX, const void *beta,
                 void *Y, const int incY);
void cblas_cgbmv(const enum CBLAS_ORDER order,
                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
                 const int KL, const int KU, const void *alpha,
                 const void *A, const int lda, const void *X,
                 const int incX, const void *beta, void *Y, const int incY);
void cblas_ctrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
                 const int N, const void *A, const int lda, 
                 void *X, const int incX);
void cblas_ctbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
                 const int N, const int K, const void *A, const int lda, 
                 void *X, const int incX);
void cblas_ctpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
                 const int N, const void *Ap, void *X, const int incX);
void cblas_ctrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
                 const int N, const void *A, const int lda, void *X,
                 const int incX);
void cblas_ctbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
                 const int N, const int K, const void *A, const int lda,
                 void *X, const int incX);
void cblas_ctpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
                 const int N, const void *Ap, void *X, const int incX);

// Double-precision complex Level 2 prototypes. The signatures are textually
// identical to the single-precision complex set apart from the routine
// names: every complex operand is a void pointer, so the element type is
// implied by the routine name only and is not checked by the compiler.
void cblas_zgemv(const enum CBLAS_ORDER order,
                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
                 const void *alpha, const void *A, const int lda,
                 const void *X, const int incX, const void *beta,
                 void *Y, const int incY);
void cblas_zgbmv(const enum CBLAS_ORDER order,
                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
                 const int KL, const int KU, const void *alpha,
                 const void *A, const int lda, const void *X,
                 const int incX, const void *beta, void *Y, const int incY);
void cblas_ztrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
                 const int N, const void *A, const int lda, 
                 void *X, const int incX);
void cblas_ztbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
                 const int N, const int K, const void *A, const int lda, 
                 void *X, const int incX);
void cblas_ztpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
                 const int N, const void *Ap, void *X, const int incX);
void cblas_ztrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
                 const int N, const void *A, const int lda, void *X,
                 const int incX);
void cblas_ztbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
                 const int N, const int K, const void *A, const int lda,
                 void *X, const int incX);
void cblas_ztpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
                 const int N, const void *Ap, void *X, const int incX);



// Single-precision real symmetric Level 2 prototypes. symv/sbmv/spmv take
// a read-only matrix, read-only X, by-value alpha/beta and a writable Y.
// ger/syr/spr/syr2/spr2 take read-only vectors and a writable matrix
// (A or Ap) as the last pointer argument. The sp* forms take no lda.
// Note that cblas_sspr2 names its matrix parameter A rather than Ap even
// though, like cblas_sspr, it has no lda argument.
void cblas_ssymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                 const int N, const float alpha, const float *A,
                 const int lda, const float *X, const int incX,
                 const float beta, float *Y, const int incY);
void cblas_ssbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                 const int N, const int K, const float alpha, const float *A,
                 const int lda, const float *X, const int incX,
                 const float beta, float *Y, const int incY);
void cblas_sspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                 const int N, const float alpha, const float *Ap,
                 const float *X, const int incX,
                 const float beta, float *Y, const int incY);
void cblas_sger(const enum CBLAS_ORDER order, const int M, const int N,
                const float alpha, const float *X, const int incX,
                const float *Y, const int incY, float *A, const int lda);
void cblas_ssyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                const int N, const float alpha, const float *X,
                const int incX, float *A, const int lda);
void cblas_sspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                const int N, const float alpha, const float *X,
                const int incX, float *Ap);
void cblas_ssyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                const int N, const float alpha, const float *X,
                const int incX, const float *Y, const int incY, float *A,
                const int lda);
void cblas_sspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                const int N, const float alpha, const float *X,
                const int incX, const float *Y, const int incY, float *A);

// Double-precision real symmetric Level 2 prototypes. Signatures mirror the
// single-precision set with float replaced by double: symv/sbmv/spmv write
// to Y, while ger/syr/spr/syr2/spr2 write to the trailing matrix pointer.
// As in the single-precision set, cblas_dspr2 names its matrix parameter A
// rather than Ap although it takes no lda argument.
void cblas_dsymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                 const int N, const double alpha, const double *A,
                 const int lda, const double *X, const int incX,
                 const double beta, double *Y, const int incY);
void cblas_dsbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                 const int N, const int K, const double alpha, const double *A,
                 const int lda, const double *X, const int incX,
                 const double beta, double *Y, const int incY);
void cblas_dspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                 const int N, const double alpha, const double *Ap,
                 const double *X, const int incX,
                 const double beta, double *Y, const int incY);
void cblas_dger(const enum CBLAS_ORDER order, const int M, const int N,
                const double alpha, const double *X, const int incX,
                const double *Y, const int incY, double *A, const int lda);
void cblas_dsyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                const int N, const double alpha, const double *X,
                const int incX, double *A, const int lda);
void cblas_dspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                const int N, const double alpha, const double *X,
                const int incX, double *Ap);
void cblas_dsyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                const int N, const double alpha, const double *X,
                const int incX, const double *Y, const int incY, double *A,
                const int lda);
void cblas_dspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
                const int N, const double alpha, const double *X,
                const int incX, const double *Y, const int incY, double *A);



// CBLAS prototypes for the single-precision complex routines chemv, chbmv,
// chpmv, cgeru, cgerc, cher, chpr, cher2 and chpr2. Complex operands and
// complex scalars travel as untyped (void) pointers; cher takes its scalar
// as a plain float by value. Declarations only; no definitions appear here.
void cblas_chemv(
    const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
    const int N,
    const void *alpha,
    const void *A, const int lda,
    const void *X, const int incX,
    const void *beta, void *Y, const int incY);

void cblas_chbmv(
    const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
    const int N, const int K,
    const void *alpha,
    const void *A, const int lda,
    const void *X, const int incX,
    const void *beta, void *Y, const int incY);

void cblas_chpmv(
    const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
    const int N,
    const void *alpha, const void *Ap,
    const void *X, const int incX,
    const void *beta, void *Y, const int incY);

void cblas_cgeru(
    const enum CBLAS_ORDER order,
    const int M, const int N,
    const void *alpha,
    const void *X, const int incX,
    const void *Y, const int incY,
    void *A, const int lda);

void cblas_cgerc(
    const enum CBLAS_ORDER order,
    const int M, const int N,
    const void *alpha,
    const void *X, const int incX,
    const void *Y, const int incY,
    void *A, const int lda);

void cblas_cher(
    const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
    const int N,
    const float alpha,
    const void *X, const int incX,
    void *A, const int lda);

// NOTE(review): alpha is declared as a pointer here, whereas cblas_cher takes
// it by value; confirm this matches the CBLAS library actually linked.
void cblas_chpr(
    const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
    const int N,
    const float *alpha,
    const void *X, const int incX,
    void *A);

void cblas_cher2(
    const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
    const int N,
    const void *alpha,
    const void *X, const int incX,
    const void *Y, const int incY,
    void *A, const int lda);

void cblas_chpr2(
    const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
    const int N,
    const void *alpha,
    const void *X, const int incX,
    const void *Y, const int incY,
    void *Ap);

// CBLAS prototypes for the double-precision complex routines zhemv, zhbmv,
// zhpmv, zgeru, zgerc, zher, zhpr, zher2 and zhpr2. Complex operands and
// complex scalars travel as untyped (void) pointers; zher takes its scalar
// as a plain double by value. Declarations only; no definitions appear here.
void cblas_zhemv(
    const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
    const int N,
    const void *alpha,
    const void *A, const int lda,
    const void *X, const int incX,
    const void *beta, void *Y, const int incY);

void cblas_zhbmv(
    const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
    const int N, const int K,
    const void *alpha,
    const void *A, const int lda,
    const void *X, const int incX,
    const void *beta, void *Y, const int incY);

void cblas_zhpmv(
    const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
    const int N,
    const void *alpha, const void *Ap,
    const void *X, const int incX,
    const void *beta, void *Y, const int incY);

void cblas_zgeru(
    const enum CBLAS_ORDER order,
    const int M, const int N,
    const void *alpha,
    const void *X, const int incX,
    const void *Y, const int incY,
    void *A, const int lda);

void cblas_zgerc(
    const enum CBLAS_ORDER order,
    const int M, const int N,
    const void *alpha,
    const void *X, const int incX,
    const void *Y, const int incY,
    void *A, const int lda);

void cblas_zher(
    const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
    const int N,
    const double alpha,
    const void *X, const int incX,
    void *A, const int lda);

// NOTE(review): alpha is declared as a pointer here, whereas cblas_zher takes
// it by value; confirm this matches the CBLAS library actually linked.
void cblas_zhpr(
    const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
    const int N,
    const double *alpha,
    const void *X, const int incX,
    void *A);

void cblas_zher2(
    const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
    const int N,
    const void *alpha,
    const void *X, const int incX,
    const void *Y, const int incY,
    void *A, const int lda);

void cblas_zhpr2(
    const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,
    const int N,
    const void *alpha,
    const void *X, const int incX,
    const void *Y, const int incY,
    void *Ap);




// CBLAS prototypes for the single-precision real level-3 routines sgemm,
// ssymm, ssyrk, ssyr2k, strmm and strsm. Every matrix pointer is followed by
// its leading dimension. Declarations only; no definitions appear here.
void cblas_sgemm(
    const enum CBLAS_ORDER Order,
    const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
    const int M, const int N, const int K,
    const float alpha,
    const float *A, const int lda,
    const float *B, const int ldb,
    const float beta,
    float *C, const int ldc);

void cblas_ssymm(
    const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
    const enum CBLAS_UPLO Uplo,
    const int M, const int N,
    const float alpha,
    const float *A, const int lda,
    const float *B, const int ldb,
    const float beta,
    float *C, const int ldc);

void cblas_ssyrk(
    const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
    const enum CBLAS_TRANSPOSE Trans,
    const int N, const int K,
    const float alpha,
    const float *A, const int lda,
    const float beta,
    float *C, const int ldc);

void cblas_ssyr2k(
    const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
    const enum CBLAS_TRANSPOSE Trans,
    const int N, const int K,
    const float alpha,
    const float *A, const int lda,
    const float *B, const int ldb,
    const float beta,
    float *C, const int ldc);

// B is the only non-const matrix argument of strmm and strsm.
void cblas_strmm(
    const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
    const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
    const enum CBLAS_DIAG Diag,
    const int M, const int N,
    const float alpha,
    const float *A, const int lda,
    float *B, const int ldb);

void cblas_strsm(
    const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
    const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
    const enum CBLAS_DIAG Diag,
    const int M, const int N,
    const float alpha,
    const float *A, const int lda,
    float *B, const int ldb);

// CBLAS prototypes for the double-precision real level-3 routines dgemm,
// dsymm, dsyrk, dsyr2k, dtrmm and dtrsm. Every matrix pointer is followed by
// its leading dimension. Declarations only; no definitions appear here.
void cblas_dgemm(
    const enum CBLAS_ORDER Order,
    const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
    const int M, const int N, const int K,
    const double alpha,
    const double *A, const int lda,
    const double *B, const int ldb,
    const double beta,
    double *C, const int ldc);

void cblas_dsymm(
    const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
    const enum CBLAS_UPLO Uplo,
    const int M, const int N,
    const double alpha,
    const double *A, const int lda,
    const double *B, const int ldb,
    const double beta,
    double *C, const int ldc);

void cblas_dsyrk(
    const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
    const enum CBLAS_TRANSPOSE Trans,
    const int N, const int K,
    const double alpha,
    const double *A, const int lda,
    const double beta,
    double *C, const int ldc);

void cblas_dsyr2k(
    const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
    const enum CBLAS_TRANSPOSE Trans,
    const int N, const int K,
    const double alpha,
    const double *A, const int lda,
    const double *B, const int ldb,
    const double beta,
    double *C, const int ldc);

// B is the only non-const matrix argument of dtrmm and dtrsm.
void cblas_dtrmm(
    const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
    const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
    const enum CBLAS_DIAG Diag,
    const int M, const int N,
    const double alpha,
    const double *A, const int lda,
    double *B, const int ldb);

void cblas_dtrsm(
    const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
    const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
    const enum CBLAS_DIAG Diag,
    const int M, const int N,
    const double alpha,
    const double *A, const int lda,
    double *B, const int ldb);

// CBLAS prototypes for the single-precision complex level-3 routines cgemm,
// csymm, csyrk, csyr2k, ctrmm and ctrsm. Scalars and matrices are all passed
// as untyped (void) pointers. Declarations only; no definitions appear here.
void cblas_cgemm(
    const enum CBLAS_ORDER Order,
    const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
    const int M, const int N, const int K,
    const void *alpha,
    const void *A, const int lda,
    const void *B, const int ldb,
    const void *beta,
    void *C, const int ldc);

void cblas_csymm(
    const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
    const enum CBLAS_UPLO Uplo,
    const int M, const int N,
    const void *alpha,
    const void *A, const int lda,
    const void *B, const int ldb,
    const void *beta,
    void *C, const int ldc);

void cblas_csyrk(
    const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
    const enum CBLAS_TRANSPOSE Trans,
    const int N, const int K,
    const void *alpha,
    const void *A, const int lda,
    const void *beta,
    void *C, const int ldc);

void cblas_csyr2k(
    const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
    const enum CBLAS_TRANSPOSE Trans,
    const int N, const int K,
    const void *alpha,
    const void *A, const int lda,
    const void *B, const int ldb,
    const void *beta,
    void *C, const int ldc);

// B is the only non-const matrix argument of ctrmm and ctrsm.
void cblas_ctrmm(
    const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
    const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
    const enum CBLAS_DIAG Diag,
    const int M, const int N,
    const void *alpha,
    const void *A, const int lda,
    void *B, const int ldb);

void cblas_ctrsm(
    const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
    const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
    const enum CBLAS_DIAG Diag,
    const int M, const int N,
    const void *alpha,
    const void *A, const int lda,
    void *B, const int ldb);

// CBLAS prototypes for the double-precision complex level-3 routines zgemm,
// zsymm, zsyrk, zsyr2k, ztrmm and ztrsm. Scalars and matrices are all passed
// as untyped (void) pointers. Declarations only; no definitions appear here.
void cblas_zgemm(
    const enum CBLAS_ORDER Order,
    const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
    const int M, const int N, const int K,
    const void *alpha,
    const void *A, const int lda,
    const void *B, const int ldb,
    const void *beta,
    void *C, const int ldc);

void cblas_zsymm(
    const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
    const enum CBLAS_UPLO Uplo,
    const int M, const int N,
    const void *alpha,
    const void *A, const int lda,
    const void *B, const int ldb,
    const void *beta,
    void *C, const int ldc);

void cblas_zsyrk(
    const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
    const enum CBLAS_TRANSPOSE Trans,
    const int N, const int K,
    const void *alpha,
    const void *A, const int lda,
    const void *beta,
    void *C, const int ldc);

void cblas_zsyr2k(
    const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
    const enum CBLAS_TRANSPOSE Trans,
    const int N, const int K,
    const void *alpha,
    const void *A, const int lda,
    const void *B, const int ldb,
    const void *beta,
    void *C, const int ldc);

// B is the only non-const matrix argument of ztrmm and ztrsm.
void cblas_ztrmm(
    const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
    const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
    const enum CBLAS_DIAG Diag,
    const int M, const int N,
    const void *alpha,
    const void *A, const int lda,
    void *B, const int ldb);

void cblas_ztrsm(
    const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
    const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
    const enum CBLAS_DIAG Diag,
    const int M, const int N,
    const void *alpha,
    const void *A, const int lda,
    void *B, const int ldb);



// CBLAS prototypes for the single-precision complex routines chemm, cherk
// and cher2k. Note the mixed scalar types: cherk takes both alpha and beta
// as float by value, while cher2k takes alpha as a void pointer and beta as
// a float by value. Declarations only; no definitions appear here.
void cblas_chemm(
    const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
    const enum CBLAS_UPLO Uplo,
    const int M, const int N,
    const void *alpha,
    const void *A, const int lda,
    const void *B, const int ldb,
    const void *beta,
    void *C, const int ldc);

void cblas_cherk(
    const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
    const enum CBLAS_TRANSPOSE Trans,
    const int N, const int K,
    const float alpha,
    const void *A, const int lda,
    const float beta,
    void *C, const int ldc);

void cblas_cher2k(
    const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
    const enum CBLAS_TRANSPOSE Trans,
    const int N, const int K,
    const void *alpha,
    const void *A, const int lda,
    const void *B, const int ldb,
    const float beta,
    void *C, const int ldc);

// CBLAS prototypes for the double-precision complex routines zhemm, zherk
// and zher2k. Note the mixed scalar types: zherk takes both alpha and beta
// as double by value, while zher2k takes alpha as a void pointer and beta as
// a double by value. Declarations only; no definitions appear here.
void cblas_zhemm(
    const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
    const enum CBLAS_UPLO Uplo,
    const int M, const int N,
    const void *alpha,
    const void *A, const int lda,
    const void *B, const int ldb,
    const void *beta,
    void *C, const int ldc);

void cblas_zherk(
    const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
    const enum CBLAS_TRANSPOSE Trans,
    const int N, const int K,
    const double alpha,
    const void *A, const int lda,
    const double beta,
    void *C, const int ldc);

void cblas_zher2k(
    const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
    const enum CBLAS_TRANSPOSE Trans,
    const int N, const int K,
    const void *alpha,
    const void *A, const int lda,
    const void *B, const int ldb,
    const double beta,
    void *C, const int ldc);
// end blis_prototypes_cblas.h
#else
// begin blis_prototypes_blas.h


// --- Name-mangling macro definitions -----------------------------------------

// --- Name-mangle level-1 BLAS routines ---------------------------

// Each F77_<routine> alias expands, through F77_FUNC, to the symbol name used
// for that level-1 BLAS routine. Both the lower-case and upper-case spellings
// are supplied so that F77_FUNC can choose whichever form it needs.
#define F77_isamax F77_FUNC(isamax, ISAMAX)
#define F77_idamax F77_FUNC(idamax, IDAMAX)
#define F77_icamax F77_FUNC(icamax, ICAMAX)
#define F77_izamax F77_FUNC(izamax, IZAMAX)
#define F77_sasum F77_FUNC(sasum, SASUM)
#define F77_dasum F77_FUNC(dasum, DASUM)
#define F77_scasum F77_FUNC(scasum, SCASUM)
#define F77_dzasum F77_FUNC(dzasum, DZASUM)
#define F77_saxpy F77_FUNC(saxpy, SAXPY)
#define F77_daxpy F77_FUNC(daxpy, DAXPY)
#define F77_caxpy F77_FUNC(caxpy, CAXPY)
#define F77_zaxpy F77_FUNC(zaxpy, ZAXPY)
#define F77_scopy F77_FUNC(scopy, SCOPY)
#define F77_dcopy F77_FUNC(dcopy, DCOPY)
#define F77_ccopy F77_FUNC(ccopy, CCOPY)
#define F77_zcopy F77_FUNC(zcopy, ZCOPY)
#define F77_sdot F77_FUNC(sdot, SDOT)
#define F77_ddot F77_FUNC(ddot, DDOT)
#define F77_cdotu F77_FUNC(cdotu, CDOTU)
#define F77_cdotc F77_FUNC(cdotc, CDOTC)
#define F77_zdotu F77_FUNC(zdotu, ZDOTU)
#define F77_zdotc F77_FUNC(zdotc, ZDOTC)
#define F77_snrm2 F77_FUNC(snrm2, SNRM2)
#define F77_dnrm2 F77_FUNC(dnrm2, DNRM2)
#define F77_scnrm2 F77_FUNC(scnrm2, SCNRM2)
#define F77_dznrm2 F77_FUNC(dznrm2, DZNRM2)
#define F77_sscal F77_FUNC(sscal, SSCAL)
#define F77_dscal F77_FUNC(dscal, DSCAL)
#define F77_cscal F77_FUNC(cscal, CSCAL)
#define F77_csscal F77_FUNC(csscal, CSSCAL)
#define F77_zscal F77_FUNC(zscal, ZSCAL)
#define F77_zdscal F77_FUNC(zdscal, ZDSCAL)
#define F77_sswap F77_FUNC(sswap, SSWAP)
#define F77_dswap F77_FUNC(dswap, DSWAP)
#define F77_cswap F77_FUNC(cswap, CSWAP)
#define F77_zswap F77_FUNC(zswap, ZSWAP)

// --- Name-mangle level-2 BLAS routines ---------------------------

// Each F77_<routine> alias expands, through F77_FUNC, to the symbol name used
// for that level-2 BLAS routine. Both the lower-case and upper-case spellings
// are supplied so that F77_FUNC can choose whichever form it needs.
#define F77_sgemv F77_FUNC(sgemv, SGEMV)
#define F77_dgemv F77_FUNC(dgemv, DGEMV)
#define F77_cgemv F77_FUNC(cgemv, CGEMV)
#define F77_zgemv F77_FUNC(zgemv, ZGEMV)
#define F77_sger F77_FUNC(sger, SGER)
#define F77_dger F77_FUNC(dger, DGER)
#define F77_cgerc F77_FUNC(cgerc, CGERC)
#define F77_cgeru F77_FUNC(cgeru, CGERU)
#define F77_zgerc F77_FUNC(zgerc, ZGERC)
#define F77_zgeru F77_FUNC(zgeru, ZGERU)
#define F77_chemv F77_FUNC(chemv, CHEMV)
#define F77_zhemv F77_FUNC(zhemv, ZHEMV)
#define F77_cher F77_FUNC(cher, CHER)
#define F77_zher F77_FUNC(zher, ZHER)
#define F77_cher2 F77_FUNC(cher2, CHER2)
#define F77_zher2 F77_FUNC(zher2, ZHER2)
#define F77_ssymv F77_FUNC(ssymv, SSYMV)
#define F77_dsymv F77_FUNC(dsymv, DSYMV)
#define F77_ssyr F77_FUNC(ssyr, SSYR)
#define F77_dsyr F77_FUNC(dsyr, DSYR)
#define F77_ssyr2 F77_FUNC(ssyr2, SSYR2)
#define F77_dsyr2 F77_FUNC(dsyr2, DSYR2)
#define F77_strmv F77_FUNC(strmv, STRMV)
#define F77_dtrmv F77_FUNC(dtrmv, DTRMV)
#define F77_ctrmv F77_FUNC(ctrmv, CTRMV)
#define F77_ztrmv F77_FUNC(ztrmv, ZTRMV)
#define F77_strsv F77_FUNC(strsv, STRSV)
#define F77_dtrsv F77_FUNC(dtrsv, DTRSV)
#define F77_ctrsv F77_FUNC(ctrsv, CTRSV)
#define F77_ztrsv F77_FUNC(ztrsv, ZTRSV)

// --- Name-mangle level-3 BLAS routines ---------------------------

// Each F77_<routine> alias expands, through F77_FUNC, to the symbol name used
// for that level-3 BLAS routine. Both the lower-case and upper-case spellings
// are supplied so that F77_FUNC can choose whichever form it needs.
#define F77_sgemm F77_FUNC(sgemm, SGEMM)
#define F77_dgemm F77_FUNC(dgemm, DGEMM)
#define F77_cgemm F77_FUNC(cgemm, CGEMM)
#define F77_zgemm F77_FUNC(zgemm, ZGEMM)
#define F77_chemm F77_FUNC(chemm, CHEMM)
#define F77_zhemm F77_FUNC(zhemm, ZHEMM)
#define F77_cherk F77_FUNC(cherk, CHERK)
#define F77_zherk F77_FUNC(zherk, ZHERK)
#define F77_cher2k F77_FUNC(cher2k, CHER2K)
#define F77_zher2k F77_FUNC(zher2k, ZHER2K)
#define F77_ssymm F77_FUNC(ssymm, SSYMM)
#define F77_dsymm F77_FUNC(dsymm, DSYMM)
#define F77_csymm F77_FUNC(csymm, CSYMM)
#define F77_zsymm F77_FUNC(zsymm, ZSYMM)
#define F77_ssyrk F77_FUNC(ssyrk, SSYRK)
#define F77_dsyrk F77_FUNC(dsyrk, DSYRK)
#define F77_csyrk F77_FUNC(csyrk, CSYRK)
#define F77_zsyrk F77_FUNC(zsyrk, ZSYRK)
#define F77_ssyr2k F77_FUNC(ssyr2k, SSYR2K)
#define F77_dsyr2k F77_FUNC(dsyr2k, DSYR2K)
#define F77_csyr2k F77_FUNC(csyr2k, CSYR2K)
#define F77_zsyr2k F77_FUNC(zsyr2k, ZSYR2K)
#define F77_strmm F77_FUNC(strmm, STRMM)
#define F77_dtrmm F77_FUNC(dtrmm, DTRMM)
#define F77_ctrmm F77_FUNC(ctrmm, CTRMM)
#define F77_ztrmm F77_FUNC(ztrmm, ZTRMM)
#define F77_strsm F77_FUNC(strsm, STRSM)
#define F77_dtrsm F77_FUNC(dtrsm, DTRSM)
#define F77_ctrsm F77_FUNC(ctrsm, CTRSM)
#define F77_ztrsm F77_FUNC(ztrsm, ZTRSM)


// --- Prototypes --------------------------------------------------------------

// --- Level-1 BLAS prototypes -------------------

// Level-1 BLAS routines declared under their F77_* aliases. Every argument,
// including sizes, strides and scalars, is passed by pointer. Declarations
// only; no definitions appear in this header.

// --- amax ---
int F77_isamax(int *n, float *x, int *incx);
int F77_idamax(int *n, double *x, int *incx);
int F77_icamax(int *n, scomplex *x, int *incx);
int F77_izamax(int *n, dcomplex *x, int *incx);

// --- asum ---
float F77_sasum(int *n, float *x, int *incx);
double F77_dasum(int *n, double *x, int *incx);
float F77_scasum(int *n, scomplex *x, int *incx);
double F77_dzasum(int *n, dcomplex *x, int *incx);

// --- axpy ---
void F77_saxpy(int *n, float *alpha, float *x, int *incx, float *y, int *incy);
void F77_daxpy(int *n, double *alpha, double *x, int *incx, double *y, int *incy);
void F77_caxpy(int *n, scomplex *alpha, scomplex *x, int *incx, scomplex *y, int *incy);
void F77_zaxpy(int *n, dcomplex *alpha, dcomplex *x, int *incx, dcomplex *y, int *incy);

// --- copy ---
void F77_scopy(int *n, float *x, int *incx, float *y, int *incy);
void F77_dcopy(int *n, double *x, int *incx, double *y, int *incy);
void F77_ccopy(int *n, scomplex *x, int *incx, scomplex *y, int *incy);
void F77_zcopy(int *n, dcomplex *x, int *incx, dcomplex *y, int *incy);

// --- dot ---
// NOTE(review): the complex variants return a struct by value; whether that
// matches the Fortran compiler's return convention is not visible here.
float F77_sdot(int *n, float *x, int *incx, float *y, int *incy);
double F77_ddot(int *n, double *x, int *incx, double *y, int *incy);
scomplex F77_cdotu(int *n, scomplex *x, int *incx, scomplex *y, int *incy);
scomplex F77_cdotc(int *n, scomplex *x, int *incx, scomplex *y, int *incy);
dcomplex F77_zdotu(int *n, dcomplex *x, int *incx, dcomplex *y, int *incy);
dcomplex F77_zdotc(int *n, dcomplex *x, int *incx, dcomplex *y, int *incy);

// --- nrm2 ---
float F77_snrm2(int *n, float *x, int *incx);
double F77_dnrm2(int *n, double *x, int *incx);
float F77_scnrm2(int *n, scomplex *x, int *incx);
double F77_dznrm2(int *n, dcomplex *x, int *incx);

// --- scal ---
// csscal and zdscal take a real scalar with a complex vector.
void F77_sscal(int *n, float *alpha, float *y, int *incy);
void F77_dscal(int *n, double *alpha, double *y, int *incy);
void F77_cscal(int *n, scomplex *alpha, scomplex *y, int *incy);
void F77_csscal(int *n, float *alpha, scomplex *y, int *incy);
void F77_zscal(int *n, dcomplex *alpha, dcomplex *y, int *incy);
void F77_zdscal(int *n, double *alpha, dcomplex *y, int *incy);

// --- swap ---
void F77_sswap(int *n, float *x, int *incx, float *y, int *incy);
void F77_dswap(int *n, double *x, int *incx, double *y, int *incy);
void F77_cswap(int *n, scomplex *x, int *incx, scomplex *y, int *incy);
void F77_zswap(int *n, dcomplex *x, int *incx, dcomplex *y, int *incy);

// --- Level-2 BLAS prototypes -------------------

// --- gemv ---
void     F77_sgemv  ( char* transa, int* m, int* n, float*    alpha, float*    a, int* lda, float*    x, int* incx, float*    beta, float*    y, int* incy );
void     F77_dgemv  ( char* transa, int* m, int* n, double*   alpha, double*   a, int* lda, double*   x, int* incx, double*   beta, double*   y, int* incy );
void     F77_cgemv  ( char* transa, int* m, int* n, scomplex* alpha, scomplex* a, int* lda, scomplex* x, int* incx, scomplex* beta, scomplex* y, int* incy );
void     F77_zgemv  ( char* transa, int* m, int* n, dcomplex* alpha, dcomplex* a, int* lda, dcomplex* x, int* incx, dcomplex* beta, dcomplex* y, int* incy );
// --- ger ---
void     F77_sger   ( int* m, int* n, float*    alpha, float*    x, int* incx, float*    y, int* incy, float*    a, int* lda );
void     F77_dger   ( int* m, int* n, double*   alpha, double*   x, int* incx, double*   y, int* incy, double*   a, int* lda );
void     F77_cgerc  ( int* m, int* n, scomplex* alpha, scomplex* x, int* incx, scomplex* y, int* incy, scomplex* a, int* lda );
void     F77_cgeru  ( int* m, int* n, scomplex* alpha, scomplex* x, int* incx, scomplex* y, int* incy, scomplex* a, int* lda );
void     F77_zgerc  ( int* m, int* n, dcomplex* alpha, dcomplex* x, int* incx, dcomplex* y, int* incy, dcomplex* a, int* lda );
void     F77_zgeru  ( int* m, int* n, dcomplex* alpha, dcomplex* x, int* incx, dcomplex* y, int* incy, dcomplex* a, int* lda );
// --- hemv ---
void     F77_chemv  ( char* uplo, int* n, scomplex* alpha, scomplex* a, int* lda, scomplex* x, int* incx, scomplex* beta, scomplex* y, int* incy );
void     F77_zhemv  ( char* uplo, int* n, dcomplex* alpha, dcomplex* a, int* lda, dcomplex* x, int* incx, dcomplex* beta, dcomplex* y, int* incy );
// --- her ---
void     F77_cher   ( char* uplo, int* n, float*    alpha, scomplex* x, int* incx, scomplex* a, int* lda );
void     F77_zher   ( char* uplo, int* n, double*   alpha, dcomplex* x, int* incx, dcomplex* a, int* lda );
// --- her2 ---
void     F77_cher2  ( char* uplo, int* n, scomplex* alpha, scomplex* x, int* incx, scomplex* y, int* incy, scomplex* a, int* lda );
void     F77_zher2  ( char* uplo, int* n, dcomplex* alpha, dcomplex* x, int* incx, dcomplex* y, int* incy, dcomplex* a, int* lda );
// --- symv ---
void     F77_ssymv  ( char* uplo, int* n, float*    alpha, float*    a, int* lda, float*    x, int* incx, float*    beta, float*    y, int* incy );
void     F77_dsymv  ( char* uplo, int* n, double*   alpha, double*   a, int* lda, double*   x, int* incx, double*   beta, double*   y, int* incy );
// --- syr ---
void     F77_ssyr   ( char* uplo, int* n, float*    alpha, float*    x, int* incx, float*    a, int* lda );
void     F77_dsyr   ( char* uplo, int* n, double*   alpha, double*   x, int* incx, double*   a, int* lda );
// --- syr2 ---
void     F77_ssyr2  ( char* uplo, int* n, float*    alpha, float*    x, int* incx, float*    y, int* incy, float*    a, int* lda );
void     F77_dsyr2  ( char* uplo, int* n, double*   alpha, double*   x, int* incx, double*   y, int* incy, double*   a, int* lda );
// --- trmv ---
void     F77_strmv  ( char* uplo, char* transa, char* diag, int* n,  float*    a, int* lda, float*    y, int* incy );
void     F77_dtrmv  ( char* uplo, char* transa, char* diag, int* n,  double*   a, int* lda, double*   y, int* incy );
void     F77_ctrmv  ( char* uplo, char* transa, char* diag, int* n,  scomplex* a, int* lda, scomplex* y, int* incy );
void     F77_ztrmv  ( char* uplo, char* transa, char* diag, int* n,  dcomplex* a, int* lda, dcomplex* y, int* incy );
// --- trsv ---
void     F77_strsv  ( char* uplo, char* transa, char* diag, int* n,  float*    a, int* lda, float*    y, int* incy );
void     F77_dtrsv  ( char* uplo, char* transa, char* diag, int* n,  double*   a, int* lda, double*   y, int* incy );
void     F77_ctrsv  ( char* uplo, char* transa, char* diag, int* n,  scomplex* a, int* lda, scomplex* y, int* incy );
void     F77_ztrsv  ( char* uplo, char* transa, char* diag, int* n,  dcomplex* a, int* lda, dcomplex* y, int* incy );

// --- Level-3 BLAS prototypes -------------------

// Level-3 BLAS routines declared under their F77_* aliases. Option flags are
// passed as char pointers and everything else (sizes, leading dimensions,
// scalars, matrices) by pointer as well. Declarations only; no definitions
// appear in this header.

// --- gemm ---
void F77_sgemm(char *transa, char *transb, int *m, int *n, int *k,
               float *alpha, float *a, int *lda, float *b, int *ldb,
               float *beta, float *c, int *ldc);
void F77_dgemm(char *transa, char *transb, int *m, int *n, int *k,
               double *alpha, double *a, int *lda, double *b, int *ldb,
               double *beta, double *c, int *ldc);
void F77_cgemm(char *transa, char *transb, int *m, int *n, int *k,
               scomplex *alpha, scomplex *a, int *lda, scomplex *b, int *ldb,
               scomplex *beta, scomplex *c, int *ldc);
void F77_zgemm(char *transa, char *transb, int *m, int *n, int *k,
               dcomplex *alpha, dcomplex *a, int *lda, dcomplex *b, int *ldb,
               dcomplex *beta, dcomplex *c, int *ldc);

// --- hemm ---
void F77_chemm(char *side, char *uplo, int *m, int *n,
               scomplex *alpha, scomplex *a, int *lda, scomplex *b, int *ldb,
               scomplex *beta, scomplex *c, int *ldc);
void F77_zhemm(char *side, char *uplo, int *m, int *n,
               dcomplex *alpha, dcomplex *a, int *lda, dcomplex *b, int *ldb,
               dcomplex *beta, dcomplex *c, int *ldc);

// --- herk ---
// Both scalars are real (float/double) while the matrices are complex.
void F77_cherk(char *uplo, char *transa, int *n, int *k,
               float *alpha, scomplex *a, int *lda,
               float *beta, scomplex *c, int *ldc);
void F77_zherk(char *uplo, char *transa, int *n, int *k,
               double *alpha, dcomplex *a, int *lda,
               double *beta, dcomplex *c, int *ldc);

// --- her2k ---
// alpha is complex, beta is real.
void F77_cher2k(char *uplo, char *transa, int *n, int *k,
                scomplex *alpha, scomplex *a, int *lda, scomplex *b, int *ldb,
                float *beta, scomplex *c, int *ldc);
void F77_zher2k(char *uplo, char *transa, int *n, int *k,
                dcomplex *alpha, dcomplex *a, int *lda, dcomplex *b, int *ldb,
                double *beta, dcomplex *c, int *ldc);

// --- symm ---
void F77_ssymm(char *side, char *uplo, int *m, int *n,
               float *alpha, float *a, int *lda, float *b, int *ldb,
               float *beta, float *c, int *ldc);
void F77_dsymm(char *side, char *uplo, int *m, int *n,
               double *alpha, double *a, int *lda, double *b, int *ldb,
               double *beta, double *c, int *ldc);
void F77_csymm(char *side, char *uplo, int *m, int *n,
               scomplex *alpha, scomplex *a, int *lda, scomplex *b, int *ldb,
               scomplex *beta, scomplex *c, int *ldc);
void F77_zsymm(char *side, char *uplo, int *m, int *n,
               dcomplex *alpha, dcomplex *a, int *lda, dcomplex *b, int *ldb,
               dcomplex *beta, dcomplex *c, int *ldc);

// --- syrk ---
void F77_ssyrk(char *uplo, char *transa, int *n, int *k,
               float *alpha, float *a, int *lda,
               float *beta, float *c, int *ldc);
void F77_dsyrk(char *uplo, char *transa, int *n, int *k,
               double *alpha, double *a, int *lda,
               double *beta, double *c, int *ldc);
void F77_csyrk(char *uplo, char *transa, int *n, int *k,
               scomplex *alpha, scomplex *a, int *lda,
               scomplex *beta, scomplex *c, int *ldc);
void F77_zsyrk(char *uplo, char *transa, int *n, int *k,
               dcomplex *alpha, dcomplex *a, int *lda,
               dcomplex *beta, dcomplex *c, int *ldc);

// --- syr2k ---
void F77_ssyr2k(char *uplo, char *transa, int *n, int *k,
                float *alpha, float *a, int *lda, float *b, int *ldb,
                float *beta, float *c, int *ldc);
void F77_dsyr2k(char *uplo, char *transa, int *n, int *k,
                double *alpha, double *a, int *lda, double *b, int *ldb,
                double *beta, double *c, int *ldc);
void F77_csyr2k(char *uplo, char *transa, int *n, int *k,
                scomplex *alpha, scomplex *a, int *lda, scomplex *b, int *ldb,
                scomplex *beta, scomplex *c, int *ldc);
void F77_zsyr2k(char *uplo, char *transa, int *n, int *k,
                dcomplex *alpha, dcomplex *a, int *lda, dcomplex *b, int *ldb,
                dcomplex *beta, dcomplex *c, int *ldc);

// --- trmm ---
void F77_strmm(char *side, char *uplo, char *transa, char *diag, int *m, int *n,
               float *alpha, float *a, int *lda, float *b, int *ldb);
void F77_dtrmm(char *side, char *uplo, char *transa, char *diag, int *m, int *n,
               double *alpha, double *a, int *lda, double *b, int *ldb);
void F77_ctrmm(char *side, char *uplo, char *transa, char *diag, int *m, int *n,
               scomplex *alpha, scomplex *a, int *lda, scomplex *b, int *ldb);
void F77_ztrmm(char *side, char *uplo, char *transa, char *diag, int *m, int *n,
               dcomplex *alpha, dcomplex *a, int *lda, dcomplex *b, int *ldb);

// --- trsm ---
void F77_strsm(char *side, char *uplo, char *transa, char *diag, int *m, int *n,
               float *alpha, float *a, int *lda, float *b, int *ldb);
void F77_dtrsm(char *side, char *uplo, char *transa, char *diag, int *m, int *n,
               double *alpha, double *a, int *lda, double *b, int *ldb);
void F77_ctrsm(char *side, char *uplo, char *transa, char *diag, int *m, int *n,
               scomplex *alpha, scomplex *a, int *lda, scomplex *b, int *ldb);
void F77_ztrsm(char *side, char *uplo, char *transa, char *diag, int *m, int *n,
               dcomplex *alpha, dcomplex *a, int *lda, dcomplex *b, int *ldb);

// end blis_prototypes_blas.h
#endif

// End extern "C" construct block.
#ifdef __cplusplus
}
#endif

#endif
// end blis1.h

  // Include f2c definitions.
// begin FLA_f2c.h

// f2c.h  --  Standard Fortran to C header file
//  barf  [ba:rf]  2.  "He suggested using FORTRAN, and everybody barfed."
//  - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition)

#include <stdio.h> // skipped
#include <string.h> // skipped
#include <stdlib.h> // skipped
#include <math.h> // skipped
#include <complex.h> // skipped
#undef complex

#ifndef F2C_INCLUDE
#define F2C_INCLUDE


// C spellings of the Fortran scalar types used by f2c-translated code.
// NOTE(review): integer is plain int here while uinteger is unsigned long;
// on platforms where long is wider than int the two differ in width --
// confirm this is intended.
typedef int integer; 

typedef unsigned long int uinteger;
typedef char *address;
typedef short int shortint;
typedef float real;
typedef double doublereal;
// Complex values are plain structs with real part r and imaginary part i.
// The name `complex` is usable because <complex.h>'s macro was #undef'd above.
typedef struct { real r, i; } complex;
typedef struct { doublereal r, i; } doublecomplex;


typedef int logical; 

typedef short int shortlogical;
typedef char logical1;
typedef char integer1;
// 64-bit integer support is only declared when INTEGER_STAR_8 is defined.
#ifdef INTEGER_STAR_8	
typedef long long longint;		
typedef unsigned long long ulongint;	
// Clear / set bit b of a, computed in ulongint width.
#define qbit_clear(a,b)	((a) & ~((ulongint)1 << (b)))
#define qbit_set(a,b)	((a) |  ((ulongint)1 << (b)))
#endif

// Truth values for the `logical` types.
#define TRUE_ (1)
#define FALSE_ (0)


// Storage-class keyword used for f2c externals; may be overridden by
// defining Extern before this header is included.
#ifndef Extern
#define Extern extern
#endif



// Integer types used by the I/O descriptor structs below. Defining f2c_i2
// selects 16-bit (short) variants; otherwise long int is used.
#ifdef f2c_i2

typedef short flag;
typedef short ftnlen;
typedef short ftnint;
#else
typedef long int flag;
typedef long int ftnlen;
typedef long int ftnint;
#endif


// I/O control list.
// NOTE(review): in stock f2c.h this describes an external READ/WRITE
// (error flag, unit, end flag, format string, record number) -- confirm.
typedef struct
{	flag cierr;
	ftnint ciunit;
	flag ciend;
	char *cifmt;
	ftnint cirec;
} cilist;


// Internal-file I/O control list: like cilist, but the "unit" is a char
// buffer (iciunit) accompanied by a record length and record count.
// NOTE(review): field roles inferred from names and stock f2c.h -- confirm.
typedef struct
{	flag icierr;
	char *iciunit;
	flag iciend;
	char *icifmt;
	ftnint icirlen;
	ftnint icirnum;
} icilist;


// Descriptor carrying a unit number, a file name with explicit length
// (ofnm/ofnmlen) and several string options.
// NOTE(review): corresponds to the Fortran OPEN statement in stock f2c.h
// -- confirm.
typedef struct
{	flag oerr;
	ftnint ounit;
	char *ofnm;
	ftnlen ofnmlen;
	char *osta;
	char *oacc;
	char *ofm;
	ftnint orl;
	char *oblnk;
} olist;


// Descriptor with an error flag, a unit and a status string.
// NOTE(review): corresponds to the Fortran CLOSE statement in stock f2c.h
// -- confirm.
typedef struct
{	flag cerr;
	ftnint cunit;
	char *csta;
} cllist;


// Minimal descriptor: error flag plus unit number.
// NOTE(review): used for REWIND/BACKSPACE/ENDFILE in stock f2c.h -- confirm.
typedef struct
{	flag aerr;
	ftnint aunit;
} alist;


// Inquiry descriptor. After the error flag, unit and file name, the members
// are either ftnint pointers or char buffers paired with a length member
// (e.g. inname/innamlen).
// NOTE(review): corresponds to the Fortran INQUIRE statement in stock f2c.h;
// the pointer members are presumably outputs -- confirm.
// NOTE(review): informlen is declared ftnint while every other *len member
// is ftnlen; both are the same underlying type in this header.
typedef struct
{	flag inerr;
	ftnint inunit;
	char *infile;
	ftnlen infilen;
	ftnint	*inex;	
	ftnint	*inopen;
	ftnint	*innum;
	ftnint	*innamed;
	char	*inname;
	ftnlen	innamlen;
	char	*inacc;
	ftnlen	inacclen;
	char	*inseq;
	ftnlen	inseqlen;
	char 	*indir;
	ftnlen	indirlen;
	char	*infmt;
	ftnlen	infmtlen;
	char	*inform;
	ftnint	informlen;
	char	*inunf;
	ftnlen	inunflen;
	ftnint	*inrecl;
	ftnint	*innrec;
	char	*inblank;
	ftnlen	inblanklen;
} inlist;

// Spelling of `void` used by the f2c typedefs below.
#define VOID void

// Union able to hold one value of any of the f2c scalar types declared above
// (one member per type, distinguished by a one-letter name).
union Multitype {	
	integer1 g;
	shortint h;
	integer i;
	
	real r;
	doublereal d;
	complex c;
	doublecomplex z;
	};

typedef union Multitype Multitype;

	

// Descriptor of a single named variable: its name, its address, a pointer to
// dimension information and an integer type code.
// NOTE(review): the encoding of `dims` and `type` is not visible here.
typedef struct Vardesc
{
	char   *name;
	char   *addr;
	ftnlen *dims;
	int     type;
} Vardesc;

// A named group of variable descriptors: `vars` points to `nvars` Vardesc
// pointers (count semantics inferred from the member names -- confirm).
typedef struct Namelist
{
	char     *name;
	Vardesc **vars;
	int       nvars;
} Namelist;

// Arithmetic helper macros for f2c-translated code. Each is only defined if
// the name is not already a macro.
//
// Caveat: all of these are function-like macros that may evaluate an argument
// more than once, so arguments must be free of side effects.

// Absolute value, in the type of x.
#ifndef f2c_abs
  #define f2c_abs(x) ((x) >= 0 ? (x) : -(x))
#endif
// Absolute value converted to doublereal. The whole expansion is
// parenthesized so the cast cannot be split from its operand by surrounding
// syntax (e.g. `sizeof f2c_dabs(x)`).
#ifndef f2c_dabs
  #define f2c_dabs(x) ((doublereal)f2c_abs(x))
#endif
// Smaller / larger of two values.
#ifndef min
  #define min(a,b) ((a) <= (b) ? (a) : (b))
#endif
#ifndef max
  #define max(a,b) ((a) >= (b) ? (a) : (b))
#endif
// min / max converted to doublereal; parenthesized for the same reason as
// f2c_dabs. These expand min/max at the point of use, so they pick up
// whatever definition of min/max is active there.
#ifndef dmin
  #define dmin(a,b) ((doublereal)min(a,b))
#endif
#ifndef dmax
  #define dmax(a,b) ((doublereal)max(a,b))
#endif

// Bit helpers: test, clear or set bit b of a. The clear/set masks are built
// in uinteger width.
#define bit_test(a,b)	((a) >> (b) & 1)
#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))



// Function-pointer typedefs for procedure arguments, one per return type.
// Under C++ the parameter list is spelled (...); under C it is left empty
// (an unprototyped declaration), so the argument lists are not type-checked
// in either language.
#define F2C_proc_par_types 1
#ifdef __cplusplus
typedef int  (*U_fp)(...);
typedef shortint (*J_fp)(...);
typedef integer (*I_fp)(...);
typedef real (*R_fp)(...);
typedef doublereal (*D_fp)(...);
typedef doublereal (*E_fp)(...);
typedef  VOID (*C_fp)(...);
typedef  VOID (*Z_fp)(...);
typedef logical (*L_fp)(...);
typedef shortlogical (*K_fp)(...);
typedef  VOID (*H_fp)(...);
typedef  int (*S_fp)(...);
#else
typedef int  (*U_fp)();
typedef shortint (*J_fp)();
typedef integer (*I_fp)();
typedef real (*R_fp)();
typedef doublereal (*D_fp)();
typedef doublereal (*E_fp)();
typedef  VOID (*C_fp)();
typedef  VOID (*Z_fp)();
typedef logical (*L_fp)();
typedef shortlogical (*K_fp)();
typedef  VOID (*H_fp)();
typedef  int (*S_fp)();
#endif

// Return-type aliases for translated functions.
// NOTE(review): in stock f2c these stand for complex, character, double
// complex and "real that returns double" functions respectively -- confirm.
typedef VOID C_f;	
typedef VOID H_f;	
typedef VOID Z_f;	
typedef doublereal E_f;	



// Remove platform-identification macros that some compilers predefine, so
// these words can be used as ordinary identifiers. Define Skip_f2c_Undefs
// before including this header to keep them.
#ifndef Skip_f2c_Undefs
#undef cray
#undef gcos
#undef mc68010
#undef mc68020
#undef mips
#undef pdp11
#undef sgi
#undef sparc
#undef sun
#undef sun2
#undef sun3
#undef sun4
#undef u370
#undef u3b
#undef u3b2
#undef u3b5
#undef unix
#undef vax
#endif
#endif
// end FLA_f2c.h

  // Include general FLAME macro and _PTR macro definitions.
// begin FLA_macro_defs.h




// --- Miscellaneous macro definitions -----------------------------------------

// NOTE(review): this replaces whatever NULL the standard headers provided
// with a bare 0 for every file that includes this header.
#undef  NULL
#define NULL 0

// On Windows builds, map the C99 `restrict` keyword to the compiler-specific
// __restrict spelling.
#ifdef FLA_ENABLE_WINDOWS_BUILD
  #define restrict  __restrict
#endif


// --- Type-related macro definitions ------------------------------------------

// FLA_Bool
// Any previous TRUE/FALSE definitions are discarded first.
#undef  TRUE
#undef  FALSE
#define TRUE  1
#define FALSE 0

// FLA_Error (non-specific)
// Note that success is (-1), not 0; both values are negative.
#define FLA_SUCCESS           (-1)
#define FLA_FAILURE           (-2)

// Each FLA_* parameter type below is a plain int; the groups use disjoint
// value ranges.
// NOTE(review): the *_MASK values are presumably ANDed with a parameter
// value to obtain a small table index -- confirm against the users.

// FLA_Quadrant
// Two-digit codes: tens digit 1/2 = top/bottom, ones digit 1/2 = left/right.
#define FLA_TL                 11
#define FLA_TR                 12
#define FLA_BL                 21
#define FLA_BR                 22

// FLA_Datatype
#define FLA_FLOAT             100
#define FLA_DOUBLE            101
#define FLA_COMPLEX           102
#define FLA_DOUBLE_COMPLEX    103
#define FLA_INT               104
#define FLA_CONSTANT          105

// FLA_Elemtype
#define FLA_MATRIX            150
#define FLA_SCALAR            151

// FLA_Side
#define FLA_TOP               200
#define FLA_BOTTOM            201
#define FLA_LEFT              210
#define FLA_RIGHT             211
#define FLA_SIDE_MASK         0x1

// FLA_Uplo
#define FLA_LOWER_TRIANGULAR  300
#define FLA_UPPER_TRIANGULAR  301
#define FLA_ZERO_MATRIX       310
#define FLA_FULL_MATRIX       311
#define FLA_UPLO_MASK         0x1

// FLA_Trans
#define FLA_NO_TRANSPOSE      400
#define FLA_TRANSPOSE         401
#define FLA_CONJ_TRANSPOSE    402
#define FLA_CONJ_NO_TRANSPOSE 403
#define FLA_TRANS_MASK        0x3

// FLA_Conj
#define FLA_NO_CONJUGATE      450
#define FLA_CONJUGATE         451

// FLA_Diag
#define FLA_UNIT_DIAG         500
#define FLA_NONUNIT_DIAG      501
#define FLA_ZERO_DIAG         502
#define FLA_DIAG_MASK         0x3

// FLA_Dimension
#define FLA_DIMENSION_M       600
#define FLA_DIMENSION_K       601
#define FLA_DIMENSION_N       602
#define FLA_DIMENSION_MIN     603

// FLA_Dimension_index
// Zero-based counterparts of the FLA_DIMENSION_* values above, in the same
// order (M, K, N, MIN).
#define FLA_DIM_M_INDEX         0
#define FLA_DIM_K_INDEX         1
#define FLA_DIM_N_INDEX         2
#define FLA_DIM_MIN_INDEX       3
#define FLA_DIM_INDEX_MASK    0x3

// Further int-valued parameter groups; each group occupies its own
// hundred-range so values from different groups never coincide.

// FLA_Pivot_type
#define FLA_NATIVE_PIVOTS     700
#define FLA_LAPACK_PIVOTS     701

// FLA_Direct
#define FLA_FORWARD           800
#define FLA_BACKWARD          801

// FLA_Store
#define FLA_COLUMNWISE        900
#define FLA_ROWWISE           901

// FLA_Matrix_type
#define FLA_FLAT             1000
#define FLA_HIER             1001

// FLA_Precision
#define FLA_SINGLE_PRECISION 1100
#define FLA_DOUBLE_PRECISION 1101

// FLA_Domain
#define FLA_REAL_DOMAIN      1200
#define FLA_COMPLEX_DOMAIN   1201

// FLA_Inv    
#define FLA_NO_INVERSE       1300
#define FLA_INVERSE          1301

// FLA_Evd_type
#define FLA_EVD_WITHOUT_VECTORS         1400
#define FLA_EVD_WITH_VECTORS            1401
#define FLA_EVD_OF_TRIDIAG_WITH_VECTORS 1402

// FLA_Svd_type
#define FLA_SVD_VECTORS_ALL           1500
#define FLA_SVD_VECTORS_MIN_COPY      1501
#define FLA_SVD_VECTORS_MIN_OVERWRITE 1502
#define FLA_SVD_VECTORS_NONE          1503

// FLA_Machval
// FLA_MACH_START aliases the first value (FLA_MACH_EPS), and FLA_MACH_N_VALS
// is the number of values defined (1600 through 1610 inclusive).
#define FLA_MACH_START                1600
#define FLA_MACH_EPS                  1600
#define FLA_MACH_SFMIN                1601
#define FLA_MACH_BASE                 1602
#define FLA_MACH_PREC                 1603
#define FLA_MACH_NDIGMANT             1604
#define FLA_MACH_RND                  1605
#define FLA_MACH_EMIN                 1606
#define FLA_MACH_RMIN                 1607
#define FLA_MACH_EMAX                 1608
#define FLA_MACH_RMAX                 1609
#define FLA_MACH_EPS2                 1610
#define FLA_MACH_N_VALS                 11

// FLA_Diag_off
#define FLA_SUPER_DIAGONAL     ( 1)
#define FLA_MAIN_DIAGONAL        0
#define FLA_SUB_DIAGONAL       (-1)

// FLAME threading model
// Compared against FLA_MULTITHREADING_MODEL in #if directives; the
// configuration section at the top of this header sets that to 0, which
// matches neither value.
#define FLA_OPENMP              1
#define FLA_PTHREADS            2

// FLAME vector intrinsics types
// Compared against FLA_VECTOR_INTRINSIC_TYPE in #if directives.
#define FLA_NO_INTRINSICS       0
#define FLA_SSE_INTRINSICS      3

// FLAME internal error checking level
// Same scale as FLA_INTERNAL_ERROR_CHECKING_LEVEL from the configuration.
#define FLA_FULL_ERROR_CHECKING 2
#define FLA_MIN_ERROR_CHECKING  1
#define FLA_NO_ERROR_CHECKING   0

// FLA_Datatype_index
// Zero-based indices for the four floating-point datatypes (s, d, c, z).
#define FLA_S_INDEX             0
#define FLA_D_INDEX             1
#define FLA_C_INDEX             2
#define FLA_Z_INDEX             3
#define FLA_DTYPE_INDEX_MASK  0x3

// Default blocksizes, used if none are otherwise available. Each may be
// overridden by defining the macro before this point.
#ifndef FLA_DEFAULT_M_BLOCKSIZE
  #define FLA_DEFAULT_M_BLOCKSIZE  128
#endif
#ifndef FLA_DEFAULT_K_BLOCKSIZE
  #define FLA_DEFAULT_K_BLOCKSIZE  128
#endif
#ifndef FLA_DEFAULT_N_BLOCKSIZE
  #define FLA_DEFAULT_N_BLOCKSIZE  128
#endif

// QR and LQ factorizations typically have an inner blocksize that corresponds
// to the length of the S (or T) block Householder matrix. For consistency, we
// define the ratio of the inner blocksize to the outer blocksize here, as it
// is used in several places. Note that other operations have analogous inner
// blocksizes, which we also define in terms of the outer storage blocksize,
// or in some cases such as Hessenberg, tridiagonal, and bidiagonal reductions,
// in terms of the system-wide default blocksize.
// These are floating-point constants, so multiplying by them yields a
// double; callers must convert the result back to an integer type themselves.
#define FLA_QR_INNER_TO_OUTER_B_RATIO      (0.25)
#define FLA_LQ_INNER_TO_OUTER_B_RATIO      (0.25)
#define FLA_LU_INNER_TO_OUTER_B_RATIO      (0.25)
#define FLA_UDDATE_INNER_TO_OUTER_B_RATIO  (0.25)
#define FLA_HESS_INNER_TO_OUTER_B_RATIO    (0.25)
#define FLA_TRIDIAG_INNER_TO_OUTER_B_RATIO (0.25)
#define FLA_BIDIAG_INNER_TO_OUTER_B_RATIO  (0.25)
#define FLA_CAQR_INNER_TO_OUTER_B_RATIO    (0.25)



// --- Error-related macro definitions -----------------------------------------

// Useful when determining the relative index base of the error codes.
// Note: "MIN" and "MAX" refer to magnitude -- all codes are negative, so
// FLA_ERROR_CODE_MIN (-10) is numerically greater than FLA_ERROR_CODE_MAX.
#define FLA_ERROR_CODE_MIN                    (-10)

// FLA_Error values.
// The numbering is not contiguous: -31, -34, -36, -62, -66 and -81 are not
// assigned to any name in this table. FLA_SUCCESS (-1) and FLA_FAILURE (-2)
// lie outside the [MAX, MIN] range.
#define FLA_INVALID_SIDE                      (-10)
#define FLA_INVALID_UPLO                      (-11)
#define FLA_INVALID_TRANS                     (-12)
#define FLA_INVALID_TRANS_GIVEN_DATATYPE      (-13)
#define FLA_INVALID_CONJ                      (-14)
#define FLA_INVALID_DIRECT                    (-15)
#define FLA_INVALID_STOREV                    (-16)
#define FLA_INVALID_DATATYPE                  (-17)
#define FLA_INVALID_INTEGER_DATATYPE          (-18)
#define FLA_INVALID_REAL_DATATYPE             (-19)
#define FLA_INVALID_COMPLEX_DATATYPE          (-20)
#define FLA_OBJECT_NOT_INTEGER                (-21)
#define FLA_OBJECT_NOT_REAL                   (-22)
#define FLA_OBJECT_NOT_COMPLEX                (-23)
#define FLA_OBJECT_NOT_SQUARE                 (-24)
#define FLA_OBJECT_NOT_SCALAR                 (-25)
#define FLA_OBJECT_NOT_VECTOR                 (-26)
#define FLA_INCONSISTENT_DATATYPES            (-27)
#define FLA_NONCONFORMAL_DIMENSIONS           (-28)
#define FLA_UNEQUAL_VECTOR_DIMS               (-29)
#define FLA_INVALID_HESSENBERG_INDICES        (-30)
#define FLA_NULL_POINTER                      (-32)
#define FLA_SPECIFIED_OBJ_DIM_MISMATCH        (-33)
#define FLA_INVALID_PIVOT_TYPE                (-35)
#define FLA_MALLOC_RETURNED_NULL_POINTER      (-37)
#define FLA_OBJECT_BASE_BUFFER_MISMATCH       (-38)
#define FLA_OBJECTS_NOT_VERTICALLY_ADJ        (-39)
#define FLA_OBJECTS_NOT_HORIZONTALLY_ADJ      (-40)
#define FLA_ADJACENT_OBJECT_DIM_MISMATCH      (-41)
#define FLA_OBJECTS_NOT_VERTICALLY_ALIGNED    (-42)
#define FLA_OBJECTS_NOT_HORIZONTALLY_ALIGNED  (-43)
#define FLA_INVALID_FLOATING_DATATYPE         (-44)
#define FLA_OBJECT_NOT_FLOATING_POINT         (-45)
#define FLA_INVALID_BLOCKSIZE_VALUE           (-46)
#define FLA_OPEN_RETURNED_ERROR               (-47)
#define FLA_LSEEK_RETURNED_ERROR              (-48)
#define FLA_CLOSE_RETURNED_ERROR              (-49)
#define FLA_UNLINK_RETURNED_ERROR             (-50)
#define FLA_READ_RETURNED_ERROR               (-51)
#define FLA_WRITE_RETURNED_ERROR              (-52)
#define FLA_INVALID_QUADRANT                  (-53)
#define FLA_NOT_YET_IMPLEMENTED               (-54)
#define FLA_EXPECTED_NONNEGATIVE_VALUE        (-55)
#define FLA_SUPERMATRIX_NOT_ENABLED           (-56)
#define FLA_UNDEFINED_ERROR_CODE              (-57)
#define FLA_INVALID_DIAG                      (-58)
#define FLA_INCONSISTENT_OBJECT_PRECISION     (-59)
#define FLA_INVALID_BLOCKSIZE_OBJ             (-60)
#define FLA_VECTOR_DIM_BELOW_MIN              (-61)
#define FLA_PTHREAD_CREATE_RETURNED_ERROR     (-63)
#define FLA_PTHREAD_JOIN_RETURNED_ERROR       (-64)
#define FLA_INVALID_ISGN_VALUE                (-65)
#define FLA_CHOL_FAILED_MATRIX_NOT_SPD        (-67)
#define FLA_INVALID_ELEMTYPE                  (-68)
#define FLA_POSIX_MEMALIGN_FAILED             (-69)
#define FLA_INVALID_SUBMATRIX_DIMS            (-70)
#define FLA_INVALID_SUBMATRIX_OFFSET          (-71)
#define FLA_OBJECT_NOT_SCALAR_ELEMTYPE        (-72)
#define FLA_OBJECT_NOT_MATRIX_ELEMTYPE        (-73)
#define FLA_ENCOUNTERED_NON_POSITIVE_NTHREADS (-74)
#define FLA_INVALID_CONJ_GIVEN_DATATYPE       (-75)
#define FLA_INVALID_COMPLEX_TRANS             (-76)
#define FLA_INVALID_REAL_TRANS                (-77)
#define FLA_INVALID_BLAS_TRANS                (-78)
#define FLA_INVALID_NONCONSTANT_DATATYPE      (-79)
#define FLA_OBJECT_NOT_NONCONSTANT            (-80)
#define FLA_OBJECT_DATATYPES_NOT_EQUAL        (-82)
#define FLA_DIVIDE_BY_ZERO                    (-83)
#define FLA_OBJECT_ELEMTYPES_NOT_EQUAL        (-84)
#define FLA_INVALID_PIVOT_INDEX_RANGE         (-85)
#define FLA_HOUSEH_PANEL_MATRIX_TOO_SMALL     (-86)
#define FLA_INVALID_OBJECT_LENGTH             (-87)
#define FLA_INVALID_OBJECT_WIDTH              (-88)
#define FLA_INVALID_ERROR_CHECKING_LEVEL      (-89)
#define FLA_ATTEMPTED_OVER_REPART_2X2         (-90)
#define FLA_ATTEMPTED_OVER_REPART_2X1         (-91)
#define FLA_ATTEMPTED_OVER_REPART_1X2         (-92)
#define FLA_EXTERNAL_LAPACK_NOT_IMPLEMENTED   (-93)
#define FLA_INVALID_ROW_STRIDE                (-94)
#define FLA_INVALID_COL_STRIDE                (-95)
#define FLA_INVALID_STRIDE_COMBINATION        (-96)
#define FLA_INVALID_VECTOR_DIM                (-97)
#define FLA_EXPECTED_ROW_VECTOR               (-98)
#define FLA_EXPECTED_COL_VECTOR               (-99)
#define FLA_INVALID_INVERSE                   (-100)
#define FLA_MALLOC_GPU_RETURNED_NULL_POINTER  (-101)
#define FLA_INVALID_EVD_TYPE                  (-102)
#define FLA_INVALID_SVD_TYPE                  (-103)
#define FLA_INVALID_MACHVAL                   (-104)
#define FLA_INVALID_DIAG_OFFSET               (-105)
#define FLA_EXPECTED_COL_STORAGE              (-106)
#define FLA_EXPECTED_ROW_STORAGE              (-107)
// (sic) "LAPAC2FLAME": the spelling is part of the public macro name.
#define FLA_LAPAC2FLAME_INVALID_RETURN        (-108)
#define FLA_INVALID_SVD_TYPE_COMBINATION      (-109)
#define FLA_INVALID_SVD_TYPE_AND_TRANS_COMBINATION (-110)
#define FLA_OBJECT_NOT_COMPARABLE             (-111)

// Necessary when computing whether an error code is defined.
// Must be kept equal to the last code in the table above.
#define FLA_ERROR_CODE_MAX                    (-111)

// Internal string matrix limits.
#define FLA_MAX_NUM_ERROR_MSGS                 150
#define FLA_MAX_ERROR_MSG_LENGTH               200

// Error code translation and output macro definition.
// Forwards `code` to FLA_Check_error_code_helper together with the file name
// and line number of the call site.
#define FLA_Check_error_code( code ) \
        FLA_Check_error_code_helper( code, __FILE__, __LINE__ )



// --- Common functions implemented as macros ----------------------------------

// These replace any earlier min/max (including the f2c versions defined
// earlier in this header). All four macros may evaluate an argument more
// than once, so arguments must be free of side effects.

// Smaller of x and y; yields y when the two compare equal.
#undef min
#define min( x, y ) ( (x) < (y) ? (x) : (y) )

// Larger of x and y; yields y when the two compare equal.
#undef max
#define max( x, y ) ( (x) > (y) ? (x) : (y) )

// Yields a when b >= 0, otherwise -a. Note that a is not made non-negative
// first, so this is not a Fortran-style SIGN(a, b).
#undef signof
#define signof( a, b ) ( (b) >= 0 ? (a) : -(a) )

// Swaps a and b using the caller-supplied scratch variable temp.
// NOTE(review): expands to a bare { } block rather than do { } while (0), so
// `if (c) exchange(a, b, t); else ...` will not compile; arguments are also
// unparenthesized and must be simple lvalues.
#undef exchange
#define exchange( a, b, temp ) { temp = a; a = b; b = temp; }

// --- Other macro definitions -------------------------------------------------

// Yields FLA_MINUS_ONE when `a` refers to the same base object as FLA_ONE,
// and FLA_ONE otherwise. Only the .base pointers are compared, so any object
// other than FLA_ONE (not just FLA_MINUS_ONE) maps to FLA_ONE.
// The argument is parenthesized so that expressions such as *p or arr[i]
// can be passed safely.
#define FLA_NEGATE( a ) \
        ( (a).base == FLA_ONE.base ? FLA_MINUS_ONE : FLA_ONE )


// end FLA_macro_defs.h
// begin FLA_macro_ptr_defs.h


// begin FLA_type_defs.h


#ifndef FLA_TYPE_DEFS_H
#define FLA_TYPE_DEFS_H

// Pull in the threading header matching the configured model. With
// FLA_MULTITHREADING_MODEL set to 0 (as in the configuration section of this
// header) neither branch is taken.
#if   FLA_MULTITHREADING_MODEL == FLA_OPENMP
#ifdef FLA_ENABLE_TIDSP
#include <ti/omp/omp.h> // skipped
#else
#include <omp.h> // skipped
#endif
#elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS
#include <pthread.h> // skipped
#endif


// --- Complex type definitions -----------------------------------------------

// Single-precision complex number: real part followed by imaginary part.
// Guarded so that another header providing the same type can coexist.
#ifndef _DEFINED_SCOMPLEX
#define _DEFINED_SCOMPLEX
typedef struct scomplex
{
  float real;
  float imag;
} scomplex;
#endif

// Double-precision complex number: real part followed by imaginary part.
#ifndef _DEFINED_DCOMPLEX
#define _DEFINED_DCOMPLEX
typedef struct dcomplex
{
  double real;
  double imag;
} dcomplex;
#endif


// --- Parameter and return type definitions ----------------------------------

// Every FLAME parameter/return type is an alias of int; the admissible
// values are the FLA_* constants from FLA_macro_defs.h. Because they are all
// the same underlying type, the compiler cannot catch one kind of parameter
// being passed where another is expected.
typedef int FLA_Bool;
typedef int FLA_Error;
typedef int FLA_Quadrant;
typedef int FLA_Datatype;
typedef int FLA_Elemtype;
typedef int FLA_Side;
typedef int FLA_Uplo;
typedef int FLA_Trans;
typedef int FLA_Conj;
typedef int FLA_Diag;
typedef int FLA_Dimension;
typedef int FLA_Pivot_type;
typedef int FLA_Direct;
typedef int FLA_Store;
typedef int FLA_Matrix_type;
typedef int FLA_Precision;
typedef int FLA_Domain;
typedef int FLA_Inv;
typedef int FLA_Evd_type;
typedef int FLA_Svd_type;
typedef int FLA_Machval;
typedef int FLA_Diag_off;

// Unsigned type used for dimensions, strides and offsets. Guarded so that
// another header providing dim_t can coexist.
#ifndef _DEFINED_DIM_T
#define _DEFINED_DIM_T
typedef unsigned long dim_t;
#endif

// --- Intrinsic/assembly definitions ----------------------------------------

// Only compiled when SSE intrinsics are selected; the configuration section
// of this header sets FLA_VECTOR_INTRINSIC_TYPE to 0, so this is inactive.
#if FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS

#include "pmmintrin.h" // skipped

//typedef double v2df __attribute__ ((vector_size (16)));

// Union overlaying an __m128 vector with its four float lanes.
typedef union
{
    __m128  v; 
    float   f[4];
} v4sf_t;

// Union overlaying an __m128d vector with its two double lanes.
typedef union
{
    __m128d v; 
    double  d[2];
} v2df_t;

#endif

// --- FLAME object definitions -----------------------------------------------

// Lock wrappers whose single member depends on the threading model.
// NOTE(review): when FLA_MULTITHREADING_MODEL is neither FLA_OPENMP nor
// FLA_PTHREADS (e.g. 0), both structs have no members, which ISO C does not
// allow (GCC and Clang accept it as an extension).
typedef struct FLA_Lock_s     FLA_Lock;
typedef struct FLA_RWLock_s   FLA_RWLock;

//#ifdef FLA_ENABLE_MULTITHREADING
struct FLA_Lock_s
{
  // Implementation-specific lock object
#if   FLA_MULTITHREADING_MODEL == FLA_OPENMP
  omp_lock_t       lock;
#elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS
  pthread_mutex_t  lock;
#endif
};
// Under OpenMP the read/write lock uses the same omp_lock_t type as
// FLA_Lock_s; only the pthreads variant has a distinct rwlock type.
struct FLA_RWLock_s
{
  // Implementation-specific lock object
#if   FLA_MULTITHREADING_MODEL == FLA_OPENMP
  omp_lock_t       lock;
#elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS
  pthread_rwlock_t lock;
#endif
};
//#endif
//#endif

// Forward declarations for the SuperMatrix types. The queue, task and
// dependency types exist only when FLA_ENABLE_SUPERMATRIX is defined;
// FLASH_Thread is declared unconditionally.
#ifdef FLA_ENABLE_SUPERMATRIX
typedef int                   FLASH_Verbose;
typedef int                   FLASH_Data_aff;

typedef struct FLASH_Queue_s  FLASH_Queue;
typedef struct FLASH_Task_s   FLASH_Task;
typedef struct FLASH_Dep_s    FLASH_Dep;
#endif
typedef struct FLASH_Thread_s FLASH_Thread;

// Base object: owns the data buffer and describes its layout. FLA_Obj views
// (below) point at one of these through their `base` member.
typedef struct FLA_Obj_struct
{
  // Basic object description fields
  FLA_Datatype  datatype;
  FLA_Elemtype  elemtype;
  dim_t         m;
  dim_t         n;
  // Strides: the FLA_*_PTR macros multiply a view's row offset by rs and its
  // column offset by cs to locate an element in `buffer`.
  dim_t         rs;
  dim_t         cs;
  dim_t         m_inner;
  dim_t         n_inner;
  unsigned long id;
  dim_t         m_index;
  dim_t         n_index;

  dim_t         n_elem_alloc;
  // Untyped storage; the FLA_*_PTR macros cast it to the element type.
  void*         buffer;
  int           buffer_info;

  FLA_Uplo      uplo;

#ifdef FLA_ENABLE_SUPERMATRIX
  // Fields for supermatrix
  int           n_read_blocks;
  int           n_write_blocks;

  // All the tasks that previously read this block, anti-dependency
  int           n_read_tasks;
  FLASH_Dep*    read_task_head;
  FLASH_Dep*    read_task_tail;

  // Task that last overwrote this block, flow dependency
  FLASH_Task*   write_task;
#endif
} FLA_Base_obj;

// View onto a base object. This is the FLA_Obj handle type and is small
// enough to be held by value (e.g. the global FLA_ONE constants).
typedef struct FLA_Obj_view
{
  // Basic object view description fields
  // offm / offn are the row / column offsets of the view within the base
  // buffer, as used by the FLA_*_PTR macros.
  dim_t         offm;
  dim_t         offn;
  dim_t         m;
  dim_t         n;
  dim_t         m_inner;
  dim_t         n_inner;

  // The base object whose buffer this view refers to.
  FLA_Base_obj* base;

} FLA_Obj;

#ifdef FLA_ENABLE_SUPERMATRIX
// Task queue header: a count plus head and tail pointers. The tasks carry
// their own prev/next links (see FLASH_Task_s).
struct FLASH_Queue_s
{
  // Number of tasks currently in queue
  unsigned int  n_tasks;

  // Pointers to head (front) and tail (back) of queue
  FLASH_Task*   head;
  FLASH_Task*   tail;
};

// One SuperMatrix task: what to run (func/cntl/name), its arguments grouped
// by kind (each group is a count plus an array pointer), its dependency list,
// and the links that place it in the task list and the wait queue.
struct FLASH_Task_s
{
  // Execution information
  int           n_ready;

  // Labels
  int           order;
  int           queue;
  int           height;
  int           thread;
  int           cache;
  FLA_Bool      hit;
      
  // Function pointer
  // Stored as void*, so callers must cast it to the right function type.
  void*         func;

  // Control tree pointer
  void*         cntl;

  // Name of task
  char*         name;

  // GPU enabled task
  FLA_Bool      enabled_gpu;

  // HIP enabled task
  FLA_Bool      enabled_hip;

  // Integer arguments
  int           n_int_args;
  int*          int_arg;

  // Constant FLA_Obj arguments
  int           n_fla_args;
  FLA_Obj*      fla_arg;

  // Input FLA_Obj arguments
  int           n_input_args;
  FLA_Obj*      input_arg;

  // Output FLA_Obj argument
  int           n_output_args;
  FLA_Obj*      output_arg;

  // Number of blocks within all macroblocks
  int           n_macro_args;

  // Number of write after read dependencies
  int           n_war_args;

  // Dependence information
  int           n_dep_args;
  FLASH_Dep*    dep_arg_head;
  FLASH_Dep*    dep_arg_tail;
  
  // Support for a doubly linked list of tasks
  FLASH_Task*   prev_task;
  FLASH_Task*   next_task;

  // Support for a doubly linked list for wait queue
  FLASH_Task*   prev_wait;
  FLASH_Task*   next_wait;
};

// Node of a singly linked dependency list; each node refers to one task.
struct FLASH_Dep_s
{
  // Task yielding dependency
  FLASH_Task*   task;

  // Support for linked list of FLASH_Deps
  FLASH_Dep*    next_dep;
};
#endif // FLA_ENABLE_SUPERMATRIX

// Per-thread record. Defined regardless of FLA_ENABLE_SUPERMATRIX; the
// pthread_t member exists only for the pthreads threading model.
struct FLASH_Thread_s
{
  // The thread's unique identifier
  int       id;

  // Pointer to variables needed to execute SuperMatrix mechanism
  void*     args;

#if FLA_MULTITHREADING_MODEL == FLA_PTHREADS
  // The thread object. Only needed for the POSIX threads implementation.
  pthread_t pthread_obj;
#endif
};

#endif // FLA_TYPE_DEFS_H
// end FLA_type_defs.h

// --- Pointer-accessing FLAME macro definitions ------------------------------------

// Byte offsets of the per-datatype slots inside the buffer of a FLA_CONSTANT
// object, as used by the FLA_*_PTR macros: int at I, float at S, double at D,
// scomplex at C and dcomplex at Z. The int, float and double slots are each
// sizeof(double) bytes wide; FLA_CONSTANT_SIZE is the total buffer size.
#define FLA_CONSTANT_I_OFFSET  0
#define FLA_CONSTANT_S_OFFSET  ( 1 * sizeof(double) )
#define FLA_CONSTANT_D_OFFSET  ( 2 * sizeof(double) )
#define FLA_CONSTANT_C_OFFSET  ( 3 * sizeof(double) )
#define FLA_CONSTANT_Z_OFFSET  ( FLA_CONSTANT_C_OFFSET + sizeof( scomplex ) )
#define FLA_CONSTANT_SIZE      ( FLA_CONSTANT_Z_OFFSET + sizeof( dcomplex ) )

// Typed element-pointer accessors for a FLA_Obj view x (passed by value, not
// by pointer). Each macro has two cases:
//   - base datatype is FLA_CONSTANT: points at that type's slot inside the
//     constant buffer (byte offset FLA_CONSTANT_?_OFFSET);
//   - otherwise: points at the view's top-left element, i.e. the typed
//     buffer advanced by offn * cs + offm * rs elements.
// No check is made that the requested type matches the object's datatype.
// x is evaluated several times, so it must be free of side effects.
// size_t must be in scope where these are used.

#define FLA_INT_PTR( x ) \
  ( ((x).base)->datatype == FLA_CONSTANT ? \
    ( ( int * )      ( ( ( char * )     ((x).base)->buffer ) + FLA_CONSTANT_I_OFFSET             ) ) : \
                     ( ( ( int * )      ((x).base)->buffer ) + ( size_t ) (x).offn * ((x).base)->cs + \
                                                               ( size_t ) (x).offm * ((x).base)->rs ) )

#define FLA_FLOAT_PTR( x ) \
  ( ((x).base)->datatype == FLA_CONSTANT ? \
    ( ( float * )    ( ( ( char * )     ((x).base)->buffer ) + FLA_CONSTANT_S_OFFSET             ) ) : \
                     ( ( ( float * )    ((x).base)->buffer ) + ( size_t ) (x).offn * ((x).base)->cs + \
                                                               ( size_t ) (x).offm * ((x).base)->rs ) )

#define FLA_DOUBLE_PTR( x ) \
  ( ((x).base)->datatype == FLA_CONSTANT ? \
    ( ( double * )   ( ( ( char * )     ((x).base)->buffer ) + FLA_CONSTANT_D_OFFSET             ) ) : \
                     ( ( ( double * )   ((x).base)->buffer ) + ( size_t ) (x).offn * ((x).base)->cs + \
                                                               ( size_t ) (x).offm * ((x).base)->rs ) )

#define FLA_COMPLEX_PTR( x ) \
  ( ((x).base)->datatype == FLA_CONSTANT ? \
    ( ( scomplex * ) ( ( ( char * )     ((x).base)->buffer ) + FLA_CONSTANT_C_OFFSET             ) ) : \
                     ( ( ( scomplex * ) ((x).base)->buffer ) + ( size_t ) (x).offn * ((x).base)->cs + \
                                                               ( size_t ) (x).offm * ((x).base)->rs ) )

#define FLA_DOUBLE_COMPLEX_PTR( x ) \
  ( ((x).base)->datatype == FLA_CONSTANT ? \
    ( ( dcomplex * ) ( ( ( char * )     ((x).base)->buffer ) + FLA_CONSTANT_Z_OFFSET             ) ) : \
                     ( ( ( dcomplex * ) ((x).base)->buffer ) + ( size_t ) (x).offn * ((x).base)->cs + \
                                                               ( size_t ) (x).offm * ((x).base)->rs ) )

// end FLA_macro_ptr_defs.h

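  // Include general FLAME type definitions, including those for FLA_Obj.
  // Note: FLA_type_defs.h was already expanded above (pulled in by
  // FLA_macro_ptr_defs.h), so FLA_TYPE_DEFS_H is defined at this point and the
  // preprocessor skips this second copy up to its closing #endif.
// begin FLA_type_defs.h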


#ifndef FLA_TYPE_DEFS_H
#define FLA_TYPE_DEFS_H

#if   FLA_MULTITHREADING_MODEL == FLA_OPENMP
#ifdef FLA_ENABLE_TIDSP
#include <ti/omp/omp.h> // skipped
#else
#include <omp.h> // skipped
#endif
#elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS
#include <pthread.h> // skipped
#endif


// --- Complex type definitions -----------------------------------------------

#ifndef _DEFINED_SCOMPLEX
#define _DEFINED_SCOMPLEX
typedef struct scomplex
{
  float real, imag;
} scomplex;
#endif

#ifndef _DEFINED_DCOMPLEX
#define _DEFINED_DCOMPLEX
typedef struct dcomplex
{
  double real, imag;
} dcomplex;
#endif


// --- Parameter and return type definitions ----------------------------------

typedef int FLA_Bool;
typedef int FLA_Error;
typedef int FLA_Quadrant;
typedef int FLA_Datatype;
typedef int FLA_Elemtype;
typedef int FLA_Side;
typedef int FLA_Uplo;
typedef int FLA_Trans;
typedef int FLA_Conj;
typedef int FLA_Diag;
typedef int FLA_Dimension;
typedef int FLA_Pivot_type;
typedef int FLA_Direct;
typedef int FLA_Store;
typedef int FLA_Matrix_type;
typedef int FLA_Precision;
typedef int FLA_Domain;
typedef int FLA_Inv;
typedef int FLA_Evd_type;
typedef int FLA_Svd_type;
typedef int FLA_Machval;
typedef int FLA_Diag_off;

#ifndef _DEFINED_DIM_T
#define _DEFINED_DIM_T
typedef unsigned long dim_t;
#endif

// --- Intrinsic/assembly definitions ----------------------------------------

#if FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS

#include "pmmintrin.h" // skipped

//typedef double v2df __attribute__ ((vector_size (16)));

typedef union
{
    __m128  v; 
    float   f[4];
} v4sf_t;

typedef union
{
    __m128d v; 
    double  d[2];
} v2df_t;

#endif

// --- FLAME object definitions -----------------------------------------------

typedef struct FLA_Lock_s     FLA_Lock;
typedef struct FLA_RWLock_s   FLA_RWLock;

//#ifdef FLA_ENABLE_MULTITHREADING
struct FLA_Lock_s
{
  // Implementation-specific lock object
#if   FLA_MULTITHREADING_MODEL == FLA_OPENMP
  omp_lock_t       lock;
#elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS
  pthread_mutex_t  lock;
#endif
};
struct FLA_RWLock_s
{
  // Implementation-specific lock object
#if   FLA_MULTITHREADING_MODEL == FLA_OPENMP
  omp_lock_t       lock;
#elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS
  pthread_rwlock_t lock;
#endif
};
//#endif

#ifdef FLA_ENABLE_SUPERMATRIX
typedef int                   FLASH_Verbose;
typedef int                   FLASH_Data_aff;

typedef struct FLASH_Queue_s  FLASH_Queue;
typedef struct FLASH_Task_s   FLASH_Task;
typedef struct FLASH_Dep_s    FLASH_Dep;
#endif
typedef struct FLASH_Thread_s FLASH_Thread;

typedef struct FLA_Obj_struct
{
  // Basic object description fields
  FLA_Datatype  datatype;
  FLA_Elemtype  elemtype;
  dim_t         m;
  dim_t         n;
  dim_t         rs;
  dim_t         cs;
  dim_t         m_inner;
  dim_t         n_inner;
  unsigned long id;
  dim_t         m_index;
  dim_t         n_index;

  dim_t         n_elem_alloc;
  void*         buffer;
  int           buffer_info;

  FLA_Uplo      uplo;

#ifdef FLA_ENABLE_SUPERMATRIX
  // Fields for supermatrix
  int           n_read_blocks;
  int           n_write_blocks;

  // All the tasks that previously read this block, anti-dependency
  int           n_read_tasks;
  FLASH_Dep*    read_task_head;
  FLASH_Dep*    read_task_tail;

  // Task that last overwrote this block, flow dependency
  FLASH_Task*   write_task;
#endif
} FLA_Base_obj;

typedef struct FLA_Obj_view
{
  // Basic object view description fields
  dim_t         offm;
  dim_t         offn;
  dim_t         m;
  dim_t         n;
  dim_t         m_inner;
  dim_t         n_inner;

  FLA_Base_obj* base;

} FLA_Obj;

#ifdef FLA_ENABLE_SUPERMATRIX
struct FLASH_Queue_s
{
  // Number of tasks currently in queue
  unsigned int  n_tasks;

  // Pointers to head (front) and tail (back) of queue
  FLASH_Task*   head;
  FLASH_Task*   tail;
};

struct FLASH_Task_s
{
  // Execution information
  int           n_ready;

  // Labels
  int           order;
  int           queue;
  int           height;
  int           thread;
  int           cache;
  FLA_Bool      hit;
      
  // Function pointer
  void*         func;

  // Control tree pointer
  void*         cntl;

  // Name of task
  char*         name;

  // GPU enabled task
  FLA_Bool      enabled_gpu;

  // HIP enabled task
  FLA_Bool      enabled_hip;

  // Integer arguments
  int           n_int_args;
  int*          int_arg;

  // Constant FLA_Obj arguments
  int           n_fla_args;
  FLA_Obj*      fla_arg;

  // Input FLA_Obj arguments
  int           n_input_args;
  FLA_Obj*      input_arg;

  // Output FLA_Obj argument
  int           n_output_args;
  FLA_Obj*      output_arg;

  // Number of blocks within all macroblocks
  int           n_macro_args;

  // Number of write after read dependencies
  int           n_war_args;

  // Dependence information
  int           n_dep_args;
  FLASH_Dep*    dep_arg_head;
  FLASH_Dep*    dep_arg_tail;
  
  // Support for a doubly linked list of tasks
  FLASH_Task*   prev_task;
  FLASH_Task*   next_task;

  // Support for a doubly linked list for wait queue
  FLASH_Task*   prev_wait;
  FLASH_Task*   next_wait;
};

struct FLASH_Dep_s
{
  // Task yielding dependency
  FLASH_Task*   task;

  // Support for linked list of FLASH_Deps
  FLASH_Dep*    next_dep;
};
#endif // FLA_ENABLE_SUPERMATRIX

struct FLASH_Thread_s
{
  // The thread's unique identifier
  int       id;

  // Pointer to variables needed to execute SuperMatrix mechanism
  void*     args;

#if FLA_MULTITHREADING_MODEL == FLA_PTHREADS
  // The thread object. Only needed for the POSIX threads implementation.
  pthread_t pthread_obj;
#endif
};

#endif // FLA_TYPE_DEFS_H
// end FLA_type_defs.h

  // Include "extern" definitions for global FLAME scalar constants.
// begin FLA_extern_defs.h


#ifndef FLA_EXTERN_DEFS_H
#define FLA_EXTERN_DEFS_H

// Global scalar constant objects, declared here and defined elsewhere.
// FLA_ONE and FLA_MINUS_ONE are the objects referenced by FLA_NEGATE.
// NOTE(review): these are presumably FLA_CONSTANT-typed objects that can be
// read through any of the FLA_*_PTR macros -- confirm at their definition.
extern FLA_Obj FLA_THREE;
extern FLA_Obj FLA_TWO;
extern FLA_Obj FLA_ONE;
extern FLA_Obj FLA_ONE_HALF;
extern FLA_Obj FLA_ZERO;
extern FLA_Obj FLA_MINUS_ONE_HALF;
extern FLA_Obj FLA_MINUS_ONE;
extern FLA_Obj FLA_MINUS_TWO;
extern FLA_Obj FLA_MINUS_THREE;

// Global objects for machine-dependent floating-point limits.
extern FLA_Obj FLA_EPSILON;
extern FLA_Obj FLA_SAFE_MIN;
extern FLA_Obj FLA_SAFE_MIN_SQUARE;
extern FLA_Obj FLA_SAFE_INV_MIN;
extern FLA_Obj FLA_SAFE_INV_MIN_SQUARE;
extern FLA_Obj FLA_UNDERFLOW_THRES;
extern FLA_Obj FLA_OVERFLOW_THRES;
extern FLA_Obj FLA_UNDERFLOW_SQUARE_THRES;
extern FLA_Obj FLA_OVERFLOW_SQUARE_THRES;

// Read-only zero values, one per floating-point datatype.
extern const float    fzero;
extern const double   dzero;
extern const scomplex czero;
extern const dcomplex zzero;

#endif

// end FLA_extern_defs.h

  // Include control tree structure definitions, utility prototypes, and
  // initialization prototypes.
// begin FLA_Cntl.h


// Shared definitions

// Blocksize record holding one dim_t value per member s, d, c and z.
// NOTE(review): the member names presumably correspond to the four
// floating-point datatypes (float, double, scomplex, dcomplex), matching the
// FLA_S/D/C/Z_INDEX ordering -- confirm against the blocksize routines.
typedef struct FLA_Blocksize_s
{
	dim_t s, d, c, z;
} fla_blocksize_t;

#define FLA_SUBPROBLEM                  0
#define FLA_UNBLOCKED_EXTERN           10
#define FLA_BLOCKED_EXTERN             13

#define FLA_UNB_VAR_OFFSET             40
#define FLA_OPT_VAR_OFFSET             80
#define FLA_BLK_VAR_OFFSET            120
#define FLA_BLF_VAR_OFFSET            160

// Unblocked variants 1..10: FLA_UNB_VAR_OFFSET plus the variant number
// (41..50 with the offset of 40 defined in this header).
#define FLA_UNBLOCKED_VARIANT1        (FLA_UNB_VAR_OFFSET+1)
#define FLA_UNBLOCKED_VARIANT2        (FLA_UNB_VAR_OFFSET+2)
#define FLA_UNBLOCKED_VARIANT3        (FLA_UNB_VAR_OFFSET+3)
#define FLA_UNBLOCKED_VARIANT4        (FLA_UNB_VAR_OFFSET+4)
#define FLA_UNBLOCKED_VARIANT5        (FLA_UNB_VAR_OFFSET+5)
#define FLA_UNBLOCKED_VARIANT6        (FLA_UNB_VAR_OFFSET+6)
#define FLA_UNBLOCKED_VARIANT7        (FLA_UNB_VAR_OFFSET+7)
#define FLA_UNBLOCKED_VARIANT8        (FLA_UNB_VAR_OFFSET+8)
#define FLA_UNBLOCKED_VARIANT9        (FLA_UNB_VAR_OFFSET+9)
#define FLA_UNBLOCKED_VARIANT10       (FLA_UNB_VAR_OFFSET+10)

// "UNB_OPT" variants 1..10: FLA_OPT_VAR_OFFSET plus the variant number
// (81..90 with the offset of 80 defined in this header).
// NOTE(review): the name suggests "optimized unblocked" -- confirm.
#define FLA_UNB_OPT_VARIANT1          (FLA_OPT_VAR_OFFSET+1)
#define FLA_UNB_OPT_VARIANT2          (FLA_OPT_VAR_OFFSET+2)
#define FLA_UNB_OPT_VARIANT3          (FLA_OPT_VAR_OFFSET+3)
#define FLA_UNB_OPT_VARIANT4          (FLA_OPT_VAR_OFFSET+4)
#define FLA_UNB_OPT_VARIANT5          (FLA_OPT_VAR_OFFSET+5)
#define FLA_UNB_OPT_VARIANT6          (FLA_OPT_VAR_OFFSET+6)
#define FLA_UNB_OPT_VARIANT7          (FLA_OPT_VAR_OFFSET+7)
#define FLA_UNB_OPT_VARIANT8          (FLA_OPT_VAR_OFFSET+8)
#define FLA_UNB_OPT_VARIANT9          (FLA_OPT_VAR_OFFSET+9)
#define FLA_UNB_OPT_VARIANT10         (FLA_OPT_VAR_OFFSET+10)

// Blocked variants 1..20: FLA_BLK_VAR_OFFSET plus the variant number
// (121..140 with the offset of 120 defined in this header). This is the only
// family with twenty entries; the others define ten.
#define FLA_BLOCKED_VARIANT1          (FLA_BLK_VAR_OFFSET+1)
#define FLA_BLOCKED_VARIANT2          (FLA_BLK_VAR_OFFSET+2)
#define FLA_BLOCKED_VARIANT3          (FLA_BLK_VAR_OFFSET+3)
#define FLA_BLOCKED_VARIANT4          (FLA_BLK_VAR_OFFSET+4)
#define FLA_BLOCKED_VARIANT5          (FLA_BLK_VAR_OFFSET+5)
#define FLA_BLOCKED_VARIANT6          (FLA_BLK_VAR_OFFSET+6)
#define FLA_BLOCKED_VARIANT7          (FLA_BLK_VAR_OFFSET+7)
#define FLA_BLOCKED_VARIANT8          (FLA_BLK_VAR_OFFSET+8)
#define FLA_BLOCKED_VARIANT9          (FLA_BLK_VAR_OFFSET+9)
#define FLA_BLOCKED_VARIANT10         (FLA_BLK_VAR_OFFSET+10)
#define FLA_BLOCKED_VARIANT11         (FLA_BLK_VAR_OFFSET+11)
#define FLA_BLOCKED_VARIANT12         (FLA_BLK_VAR_OFFSET+12)
#define FLA_BLOCKED_VARIANT13         (FLA_BLK_VAR_OFFSET+13)
#define FLA_BLOCKED_VARIANT14         (FLA_BLK_VAR_OFFSET+14)
#define FLA_BLOCKED_VARIANT15         (FLA_BLK_VAR_OFFSET+15)
#define FLA_BLOCKED_VARIANT16         (FLA_BLK_VAR_OFFSET+16)
#define FLA_BLOCKED_VARIANT17         (FLA_BLK_VAR_OFFSET+17)
#define FLA_BLOCKED_VARIANT18         (FLA_BLK_VAR_OFFSET+18)
#define FLA_BLOCKED_VARIANT19         (FLA_BLK_VAR_OFFSET+19)
#define FLA_BLOCKED_VARIANT20         (FLA_BLK_VAR_OFFSET+20)

// "BLK_FUS" variants 1..10: FLA_BLF_VAR_OFFSET plus the variant number
// (161..170 with the offset of 160 defined in this header).
// NOTE(review): the name suggests "blocked, fused" -- confirm.
#define FLA_BLK_FUS_VARIANT1          (FLA_BLF_VAR_OFFSET+1)
#define FLA_BLK_FUS_VARIANT2          (FLA_BLF_VAR_OFFSET+2)
#define FLA_BLK_FUS_VARIANT3          (FLA_BLF_VAR_OFFSET+3)
#define FLA_BLK_FUS_VARIANT4          (FLA_BLF_VAR_OFFSET+4)
#define FLA_BLK_FUS_VARIANT5          (FLA_BLF_VAR_OFFSET+5)
#define FLA_BLK_FUS_VARIANT6          (FLA_BLF_VAR_OFFSET+6)
#define FLA_BLK_FUS_VARIANT7          (FLA_BLF_VAR_OFFSET+7)
#define FLA_BLK_FUS_VARIANT8          (FLA_BLF_VAR_OFFSET+8)
#define FLA_BLK_FUS_VARIANT9          (FLA_BLF_VAR_OFFSET+9)
#define FLA_BLK_FUS_VARIANT10         (FLA_BLF_VAR_OFFSET+10)

// Accessors for the matrix_type, blocksize and variant members that the
// control node structs in this header all begin with. `cntl` must be a
// pointer to such a node.
// Both the argument and the whole expansion are parenthesized so that
// arguments like `p + 1` or `&node` expand correctly; the result is still an
// lvalue, so assignment through the macro keeps working.
#define FLA_Cntl_matrix_type( cntl )  ( (cntl)->matrix_type )
#define FLA_Cntl_blocksize( cntl )    ( (cntl)->blocksize )
#define FLA_Cntl_variant( cntl )      ( (cntl)->variant )

// Declared here, defined elsewhere. Takes an untyped pointer.
// NOTE(review): presumably accepts any of the control node types declared in
// this header -- confirm against the definition.
void FLA_Cntl_obj_free(void *cntl);


// Include the control tree definitions for each class of operation.
// begin FLA_Cntl_blas1.h



//
// Level-1 BLAS
//

// Control node for axpy: the common matrix_type/variant/blocksize header
// followed by a link to a child node of the same type.
typedef struct fla_axpy_s
{
	FLA_Matrix_type   matrix_type;
	int               variant;
	fla_blocksize_t  *blocksize;
	struct fla_axpy_s *sub_axpy;
} fla_axpy_t;


// Control node for axpyt: common header plus a child link of the same type.
typedef struct fla_axpyt_s
{
	FLA_Matrix_type    matrix_type;
	int                variant;
	fla_blocksize_t   *blocksize;
	struct fla_axpyt_s *sub_axpyt;
} fla_axpyt_t;


// Control node for copy: common header plus a child link of the same type.
typedef struct fla_copy_s
{
	FLA_Matrix_type   matrix_type;
	int               variant;
	fla_blocksize_t  *blocksize;
	struct fla_copy_s *sub_copy;
} fla_copy_t;


// Control node for copyt: common header plus a child link of the same type.
typedef struct fla_copyt_s
{
	FLA_Matrix_type    matrix_type;
	int                variant;
	fla_blocksize_t   *blocksize;
	struct fla_copyt_s *sub_copyt;
} fla_copyt_t;


// Control node for copyr: common header, a child link of the same type, and
// a link to a copy node.
typedef struct fla_copyr_s
{
	FLA_Matrix_type    matrix_type;
	int                variant;
	fla_blocksize_t   *blocksize;
	struct fla_copyr_s *sub_copyr;
	struct fla_copy_s  *sub_copy;
} fla_copyr_t;


// Control node for scal: common header plus a child link of the same type.
typedef struct fla_scal_s
{
	FLA_Matrix_type   matrix_type;
	int               variant;
	fla_blocksize_t  *blocksize;
	struct fla_scal_s *sub_scal;
} fla_scal_t;


// Control node for scalr: common header, a child link of the same type, and
// a link to a scal node.
typedef struct fla_scalr_s
{
	FLA_Matrix_type    matrix_type;
	int                variant;
	fla_blocksize_t   *blocksize;
	struct fla_scalr_s *sub_scalr;
	struct fla_scal_s  *sub_scal;
} fla_scalr_t;


// Control node for swap: common header plus a child link of the same type.
typedef struct fla_swap_s
{
	FLA_Matrix_type   matrix_type;
	int               variant;
	fla_blocksize_t  *blocksize;
	struct fla_swap_s *sub_swap;
} fla_swap_t;


// Control node for tpose: common header, a child link of the same type
// (named sub_trans, not sub_tpose), and a link to a swap node.
typedef struct fla_tpose_s
{
	FLA_Matrix_type    matrix_type;
	int                variant;
	fla_blocksize_t   *blocksize;
	struct fla_tpose_s *sub_trans;
	struct fla_swap_s  *sub_swap;
} fla_tpose_t;


// Child-link accessors for Level-1 BLAS sub-nodes. `cntl` must be a pointer
// to a node struct that has the named member. The argument and the expansion
// are parenthesized so pointer expressions such as `p + 1` or `&node` expand
// correctly; each result remains an lvalue.
#define FLA_Cntl_sub_axpy( cntl )     ( (cntl)->sub_axpy )
#define FLA_Cntl_sub_axpy1( cntl )    ( (cntl)->sub_axpy1 )
#define FLA_Cntl_sub_axpy2( cntl )    ( (cntl)->sub_axpy2 )
#define FLA_Cntl_sub_axpy3( cntl )    ( (cntl)->sub_axpy3 )
#define FLA_Cntl_sub_axpyt( cntl )    ( (cntl)->sub_axpyt )
#define FLA_Cntl_sub_copy( cntl )     ( (cntl)->sub_copy )
#define FLA_Cntl_sub_copyt( cntl )    ( (cntl)->sub_copyt )
#define FLA_Cntl_sub_copyr( cntl )    ( (cntl)->sub_copyr )
#define FLA_Cntl_sub_scal( cntl )     ( (cntl)->sub_scal )
#define FLA_Cntl_sub_scalr( cntl )    ( (cntl)->sub_scalr )
#define FLA_Cntl_sub_swap( cntl )     ( (cntl)->sub_swap )
#define FLA_Cntl_sub_trans( cntl )    ( (cntl)->sub_trans )


// Prototypes of the *_obj_create routines for the Level-1 BLAS control
// nodes. Each takes one argument per member of the matching node struct, in
// member order, and returns a pointer to that node type. The definitions are
// not part of this header.

fla_axpy_t *FLA_Cntl_axpy_obj_create(
	FLA_Matrix_type  matrix_type,
	int              variant,
	fla_blocksize_t *blocksize,
	fla_axpy_t      *sub_axpy);

fla_axpyt_t *FLA_Cntl_axpyt_obj_create(
	FLA_Matrix_type  matrix_type,
	int              variant,
	fla_blocksize_t *blocksize,
	fla_axpyt_t     *sub_axpyt);

fla_copy_t *FLA_Cntl_copy_obj_create(
	FLA_Matrix_type  matrix_type,
	int              variant,
	fla_blocksize_t *blocksize,
	fla_copy_t      *sub_copy);

fla_copyt_t *FLA_Cntl_copyt_obj_create(
	FLA_Matrix_type  matrix_type,
	int              variant,
	fla_blocksize_t *blocksize,
	fla_copyt_t     *sub_copyt);

fla_copyr_t *FLA_Cntl_copyr_obj_create(
	FLA_Matrix_type  matrix_type,
	int              variant,
	fla_blocksize_t *blocksize,
	fla_copyr_t     *sub_copyr,
	fla_copy_t      *sub_copy);

fla_scal_t *FLA_Cntl_scal_obj_create(
	FLA_Matrix_type  matrix_type,
	int              variant,
	fla_blocksize_t *blocksize,
	fla_scal_t      *sub_scal);

fla_scalr_t *FLA_Cntl_scalr_obj_create(
	FLA_Matrix_type  matrix_type,
	int              variant,
	fla_blocksize_t *blocksize,
	fla_scalr_t     *sub_scalr,
	fla_scal_t      *sub_scal);

fla_swap_t *FLA_Cntl_swap_obj_create(
	FLA_Matrix_type  matrix_type,
	int              variant,
	fla_blocksize_t *blocksize,
	fla_swap_t      *sub_swap);

fla_tpose_t *FLA_Cntl_tpose_obj_create(
	FLA_Matrix_type  matrix_type,
	int              variant,
	fla_blocksize_t *blocksize,
	fla_tpose_t     *sub_trans,
	fla_swap_t      *sub_swap);

// end FLA_Cntl_blas1.h
// begin FLA_Cntl_blas2.h



//
// Level-2 BLAS
//

// Control node for gemv: common header, a link to a scal node, and a child
// link of the same type.
typedef struct fla_gemv_s
{
	FLA_Matrix_type   matrix_type;
	int               variant;
	fla_blocksize_t  *blocksize;
	struct fla_scal_s *sub_scal;
	struct fla_gemv_s *sub_gemv;
} fla_gemv_t;

// Control node for trsv: common header, a child link of the same type, and
// a link to a gemv node.
typedef struct fla_trsv_s
{
	FLA_Matrix_type   matrix_type;
	int               variant;
	fla_blocksize_t  *blocksize;
	struct fla_trsv_s *sub_trsv;
	struct fla_gemv_s *sub_gemv;
} fla_trsv_t;


// Child-link accessors for Level-2 BLAS sub-nodes. `cntl` must be a pointer
// to a node struct with the named member. Argument and expansion are
// parenthesized so that pointer expressions expand correctly; the result is
// still an lvalue.
#define FLA_Cntl_sub_gemv( cntl )     ( (cntl)->sub_gemv )
#define FLA_Cntl_sub_trsv( cntl )     ( (cntl)->sub_trsv )


// Prototypes of the *_obj_create routines for the Level-2 BLAS control
// nodes. Arguments mirror the members of the matching node struct, in member
// order; each returns a pointer to that node type. Definitions are elsewhere.

fla_gemv_t *FLA_Cntl_gemv_obj_create(
	FLA_Matrix_type  matrix_type,
	int              variant,
	fla_blocksize_t *blocksize,
	fla_scal_t      *sub_scal,
	fla_gemv_t      *sub_gemv);

fla_trsv_t *FLA_Cntl_trsv_obj_create(
	FLA_Matrix_type  matrix_type,
	int              variant,
	fla_blocksize_t *blocksize,
	fla_trsv_t      *sub_trsv,
	fla_gemv_t      *sub_gemv);

// end FLA_Cntl_blas2.h
// begin FLA_Cntl_blas3.h



//
// Level-3 BLAS
//

// Control node for gemm: common header, a link to a scal node, and a child
// link of the same type.
typedef struct fla_gemm_s
{
	FLA_Matrix_type   matrix_type;
	int               variant;
	fla_blocksize_t  *blocksize;
	struct fla_scal_s *sub_scal;
	struct fla_gemm_s *sub_gemm;
} fla_gemm_t;


// Control node for hemm: common header, a scal link, a child link of the
// same type, and two gemm links.
typedef struct fla_hemm_s
{
	FLA_Matrix_type   matrix_type;
	int               variant;
	fla_blocksize_t  *blocksize;
	struct fla_scal_s *sub_scal;
	struct fla_hemm_s *sub_hemm;
	struct fla_gemm_s *sub_gemm1;
	struct fla_gemm_s *sub_gemm2;
} fla_hemm_t;


// Control node for herk: common header, a scalr link, a child link of the
// same type, and one gemm link.
typedef struct fla_herk_s
{
	FLA_Matrix_type    matrix_type;
	int                variant;
	fla_blocksize_t   *blocksize;
	struct fla_scalr_s *sub_scalr;
	struct fla_herk_s  *sub_herk;
	struct fla_gemm_s  *sub_gemm;
} fla_herk_t;


// Control node for her2k: common header, a scalr link, a child link of the
// same type, and two gemm links.
typedef struct fla_her2k_s
{
	FLA_Matrix_type    matrix_type;
	int                variant;
	fla_blocksize_t   *blocksize;
	struct fla_scalr_s *sub_scalr;
	struct fla_her2k_s *sub_her2k;
	struct fla_gemm_s  *sub_gemm1;
	struct fla_gemm_s  *sub_gemm2;
} fla_her2k_t;


// Control node for symm: common header, a scal link, a child link of the
// same type, and two gemm links.
typedef struct fla_symm_s
{
	FLA_Matrix_type   matrix_type;
	int               variant;
	fla_blocksize_t  *blocksize;
	struct fla_scal_s *sub_scal;
	struct fla_symm_s *sub_symm;
	struct fla_gemm_s *sub_gemm1;
	struct fla_gemm_s *sub_gemm2;
} fla_symm_t;


// Control node for syrk: common header, a scalr link, a child link of the
// same type, and one gemm link.
typedef struct fla_syrk_s
{
	FLA_Matrix_type    matrix_type;
	int                variant;
	fla_blocksize_t   *blocksize;
	struct fla_scalr_s *sub_scalr;
	struct fla_syrk_s  *sub_syrk;
	struct fla_gemm_s  *sub_gemm;
} fla_syrk_t;


// Control node for syr2k: common header, a scalr link, a child link of the
// same type, and two gemm links.
typedef struct fla_syr2k_s
{
	FLA_Matrix_type    matrix_type;
	int                variant;
	fla_blocksize_t   *blocksize;
	struct fla_scalr_s *sub_scalr;
	struct fla_syr2k_s *sub_syr2k;
	struct fla_gemm_s  *sub_gemm1;
	struct fla_gemm_s  *sub_gemm2;
} fla_syr2k_t;


// Control node for trmm: common header, a scal link, a child link of the
// same type, and one gemm link.
typedef struct fla_trmm_s
{
	FLA_Matrix_type   matrix_type;
	int               variant;
	fla_blocksize_t  *blocksize;
	struct fla_scal_s *sub_scal;
	struct fla_trmm_s *sub_trmm;
	struct fla_gemm_s *sub_gemm;
} fla_trmm_t;


// Control node for trsm: common header, a scal link, a child link of the
// same type, and one gemm link.
typedef struct fla_trsm_s
{
	FLA_Matrix_type   matrix_type;
	int               variant;
	fla_blocksize_t  *blocksize;
	struct fla_scal_s *sub_scal;
	struct fla_trsm_s *sub_trsm;
	struct fla_gemm_s *sub_gemm;
} fla_trsm_t;


// Child-link accessors for Level-3 BLAS sub-nodes. `cntl` must be a pointer
// to a node struct that has the named member. Argument and expansion are
// parenthesized so that pointer expressions such as `p + 1` or `&node`
// expand correctly; each result is still an lvalue.
// NOTE(review): in the portion of the header reviewed, no struct declares
// sub_hemm1/2, sub_herk1/2 or sub_trsm3/4; those accessors only compile
// against a type that provides such a member.
#define FLA_Cntl_sub_gemm( cntl )     ( (cntl)->sub_gemm )
#define FLA_Cntl_sub_gemm1( cntl )    ( (cntl)->sub_gemm1 )
#define FLA_Cntl_sub_gemm2( cntl )    ( (cntl)->sub_gemm2 )
#define FLA_Cntl_sub_gemm3( cntl )    ( (cntl)->sub_gemm3 )
#define FLA_Cntl_sub_gemm4( cntl )    ( (cntl)->sub_gemm4 )
#define FLA_Cntl_sub_gemm5( cntl )    ( (cntl)->sub_gemm5 )
#define FLA_Cntl_sub_gemm6( cntl )    ( (cntl)->sub_gemm6 )
#define FLA_Cntl_sub_gemm7( cntl )    ( (cntl)->sub_gemm7 )
#define FLA_Cntl_sub_gemm8( cntl )    ( (cntl)->sub_gemm8 )
#define FLA_Cntl_sub_hemm( cntl )     ( (cntl)->sub_hemm )
#define FLA_Cntl_sub_hemm1( cntl )    ( (cntl)->sub_hemm1 )
#define FLA_Cntl_sub_hemm2( cntl )    ( (cntl)->sub_hemm2 )
#define FLA_Cntl_sub_herk( cntl )     ( (cntl)->sub_herk )
#define FLA_Cntl_sub_herk1( cntl )    ( (cntl)->sub_herk1 )
#define FLA_Cntl_sub_herk2( cntl )    ( (cntl)->sub_herk2 )
#define FLA_Cntl_sub_her2k( cntl )    ( (cntl)->sub_her2k )
#define FLA_Cntl_sub_symm( cntl )     ( (cntl)->sub_symm )
#define FLA_Cntl_sub_syrk( cntl )     ( (cntl)->sub_syrk )
#define FLA_Cntl_sub_syr2k( cntl )    ( (cntl)->sub_syr2k )
#define FLA_Cntl_sub_trmm( cntl )     ( (cntl)->sub_trmm )
#define FLA_Cntl_sub_trmm1( cntl )    ( (cntl)->sub_trmm1 )
#define FLA_Cntl_sub_trmm2( cntl )    ( (cntl)->sub_trmm2 )
#define FLA_Cntl_sub_trsm( cntl )     ( (cntl)->sub_trsm )
#define FLA_Cntl_sub_trsm1( cntl )    ( (cntl)->sub_trsm1 )
#define FLA_Cntl_sub_trsm2( cntl )    ( (cntl)->sub_trsm2 )
#define FLA_Cntl_sub_trsm3( cntl )    ( (cntl)->sub_trsm3 )
#define FLA_Cntl_sub_trsm4( cntl )    ( (cntl)->sub_trsm4 )


fla_gemm_t* FLA_Cntl_gemm_obj_create( FLA_Matrix_type  matrix_type,
                                      int              variant,
                                      fla_blocksize_t* blocksize,
                                      fla_scal_t*      sub_scal,
                                      fla_gemm_t*      sub_gemm );
fla_hemm_t* FLA_Cntl_hemm_obj_create( FLA_Matrix_type  matrix_type,
                                      int              variant,
                                      fla_blocksize_t* blocksize,
                                      fla_scal_t*      sub_scal,
                                      fla_hemm_t*      sub_hemm,
                                      fla_gemm_t*      sub_gemm1,
                                      fla_gemm_t*      sub_gemm2 );
fla_herk_t* FLA_Cntl_herk_obj_create( FLA_Matrix_type  matrix_type,
                                      int              variant,
                                      fla_blocksize_t* blocksize,
                                      fla_scalr_t*     sub_scalr,
                                      fla_herk_t*      sub_herk,
                                      fla_gemm_t*      sub_gemm );
fla_her2k_t* FLA_Cntl_her2k_obj_create( FLA_Matrix_type  matrix_type,
                                        int              variant,
                                        fla_blocksize_t* blocksize,
                                        fla_scalr_t*     sub_scalr,
                                        fla_her2k_t*     sub_her2k,
                                        fla_gemm_t*      sub_gemm1,
                                        fla_gemm_t*      sub_gemm2 );
fla_symm_t* FLA_Cntl_symm_obj_create( FLA_Matrix_type  matrix_type,
                                      int              variant,
                                      fla_blocksize_t* blocksize,
                                      fla_scal_t*      sub_scal,
                                      fla_symm_t*      sub_symm,
                                      fla_gemm_t*      sub_gemm1,
                                      fla_gemm_t*      sub_gemm2 );
fla_syrk_t* FLA_Cntl_syrk_obj_create( FLA_Matrix_type  matrix_type,
                                      int              variant,
                                      fla_blocksize_t* blocksize,
                                      fla_scalr_t*     sub_scalr,
                                      fla_syrk_t*      sub_syrk,
                                      fla_gemm_t*      sub_gemm );
fla_syr2k_t* FLA_Cntl_syr2k_obj_create( FLA_Matrix_type  matrix_type,
                                        int              variant,
                                        fla_blocksize_t* blocksize,
                                        fla_scalr_t*     sub_scalr,
                                        fla_syr2k_t*     sub_syr2k,
                                        fla_gemm_t*      sub_gemm1,
                                        fla_gemm_t*      sub_gemm2 );
fla_trmm_t* FLA_Cntl_trmm_obj_create( FLA_Matrix_type  matrix_type,
                                      int              variant,
                                      fla_blocksize_t* blocksize,
                                      fla_scal_t*      sub_scal,
                                      fla_trmm_t*      sub_trmm,
                                      fla_gemm_t*      sub_gemm );
fla_trsm_t* FLA_Cntl_trsm_obj_create( FLA_Matrix_type  matrix_type,
                                      int              variant,
                                      fla_blocksize_t* blocksize,
                                      fla_scal_t*      sub_scal,
                                      fla_trsm_t*      sub_trsm,
                                      fla_gemm_t*      sub_gemm );

// end FLA_Cntl_blas3.h
// begin FLA_Cntl_lapack.h



//
// LAPACK-level
//

// Control node for chol: common header, a child link of the same type, and
// links to herk, trsm and gemm nodes.
typedef struct fla_chol_s
{
	FLA_Matrix_type   matrix_type;
	int               variant;
	fla_blocksize_t  *blocksize;
	struct fla_chol_s *sub_chol;
	struct fla_herk_s *sub_herk;
	struct fla_trsm_s *sub_trsm;
	struct fla_gemm_s *sub_gemm;
} fla_chol_t;


// Control node for ttmm: common header, a child link of the same type, and
// links to herk, trmm and gemm nodes.
typedef struct fla_ttmm_s
{
	FLA_Matrix_type   matrix_type;
	int               variant;
	fla_blocksize_t  *blocksize;
	struct fla_ttmm_s *sub_ttmm;
	struct fla_herk_s *sub_herk;
	struct fla_trmm_s *sub_trmm;
	struct fla_gemm_s *sub_gemm;
} fla_ttmm_t;


// Control node for appiv: common header plus a child link of the same type.
typedef struct fla_appiv_s
{
	FLA_Matrix_type    matrix_type;
	int                variant;
	fla_blocksize_t   *blocksize;
	struct fla_appiv_s *sub_appiv;
} fla_appiv_t;


// Control node for lu: common header, a child link of the same type, three
// gemm links, two trsm links and two appiv links.
typedef struct fla_lu_s
{
	FLA_Matrix_type    matrix_type;
	int                variant;
	fla_blocksize_t   *blocksize;
	struct fla_lu_s    *sub_lu;
	struct fla_gemm_s  *sub_gemm1;
	struct fla_gemm_s  *sub_gemm2;
	struct fla_gemm_s  *sub_gemm3;
	struct fla_trsm_s  *sub_trsm1;
	struct fla_trsm_s  *sub_trsm2;
	struct fla_appiv_s *sub_appiv1;
	struct fla_appiv_s *sub_appiv2;
} fla_lu_t;


// Control node typedef'd as fla_qrut_t (tag fla_qr_ut_s): common header, a
// child link of the same type, and a link to an apqut node. struct
// fla_apqut_s is only used through a pointer here; it is defined later in
// the header.
typedef struct fla_qr_ut_s
{
	FLA_Matrix_type    matrix_type;
	int                variant;
	fla_blocksize_t   *blocksize;
	struct fla_qr_ut_s *sub_qrut;
	struct fla_apqut_s *sub_apqut;
} fla_qrut_t;


// Control node typedef'd as fla_qr2ut_t (tag fla_qr2_ut_s): common header, a
// child link of the same type, two gemm links, and trsm, copy and axpy links.
typedef struct fla_qr2_ut_s
{
	FLA_Matrix_type     matrix_type;
	int                 variant;
	fla_blocksize_t    *blocksize;
	struct fla_qr2_ut_s *sub_qr2ut;
	struct fla_gemm_s   *sub_gemm1;
	struct fla_gemm_s   *sub_gemm2;
	struct fla_trsm_s   *sub_trsm;
	struct fla_copy_s   *sub_copy;
	struct fla_axpy_s   *sub_axpy;
} fla_qr2ut_t;


// Control node typedef'd as fla_lqut_t (tag fla_lq_ut_s): common header, a
// child link of the same type, and a link to an apqut node.
typedef struct fla_lq_ut_s
{
	FLA_Matrix_type    matrix_type;
	int                variant;
	fla_blocksize_t   *blocksize;
	struct fla_lq_ut_s *sub_lqut;
	struct fla_apqut_s *sub_apqut;
} fla_lqut_t;

// Control node for caqr2ut: common header, a child link of the same type,
// two gemm links, two trmm links, one trsm link, three axpy links and one
// copy link.
typedef struct fla_caqr2ut_s
{
	FLA_Matrix_type      matrix_type;
	int                  variant;
	fla_blocksize_t     *blocksize;
	struct fla_caqr2ut_s *sub_caqr2ut;
	struct fla_gemm_s    *sub_gemm1;
	struct fla_gemm_s    *sub_gemm2;
	struct fla_trmm_s    *sub_trmm1;
	struct fla_trmm_s    *sub_trmm2;
	struct fla_trsm_s    *sub_trsm;
	struct fla_axpy_s    *sub_axpy1;
	struct fla_axpy_s    *sub_axpy2;
	struct fla_axpy_s    *sub_axpy3;
	struct fla_copy_s    *sub_copy;
} fla_caqr2ut_t;


// Control node typedef'd as fla_hessut_t (tag fla_hess_ut_s): only the
// common matrix_type/variant/blocksize header, no child links.
typedef struct fla_hess_ut_s
{
	FLA_Matrix_type  matrix_type;
	int              variant;
	fla_blocksize_t *blocksize;
} fla_hessut_t;

// Control node typedef'd as fla_tridiagut_t (tag fla_tridiag_ut_s): only the
// common matrix_type/variant/blocksize header, no child links.
typedef struct fla_tridiag_ut_s
{
	FLA_Matrix_type  matrix_type;
	int              variant;
	fla_blocksize_t *blocksize;
} fla_tridiagut_t;

// Control node typedef'd as fla_bidiagut_t (tag fla_bidiag_ut_s): only the
// common matrix_type/variant/blocksize header, no child links.
typedef struct fla_bidiag_ut_s
{
	FLA_Matrix_type  matrix_type;
	int              variant;
	fla_blocksize_t *blocksize;
} fla_bidiagut_t;

// Control node for trinv: common header, a child link of the same type, one
// gemm link, one trmm link and two trsm links.
typedef struct fla_trinv_s
{
	FLA_Matrix_type    matrix_type;
	int                variant;
	fla_blocksize_t   *blocksize;
	struct fla_trinv_s *sub_trinv;
	struct fla_gemm_s  *sub_gemm;
	struct fla_trmm_s  *sub_trmm;
	struct fla_trsm_s  *sub_trsm1;
	struct fla_trsm_s  *sub_trsm2;
} fla_trinv_t;


// Control node for sylv: common header, three child links of the same type
// and eight gemm links.
typedef struct fla_sylv_s
{
	FLA_Matrix_type   matrix_type;
	int               variant;
	fla_blocksize_t  *blocksize;
	struct fla_sylv_s *sub_sylv1;
	struct fla_sylv_s *sub_sylv2;
	struct fla_sylv_s *sub_sylv3;
	struct fla_gemm_s *sub_gemm1;
	struct fla_gemm_s *sub_gemm2;
	struct fla_gemm_s *sub_gemm3;
	struct fla_gemm_s *sub_gemm4;
	struct fla_gemm_s *sub_gemm5;
	struct fla_gemm_s *sub_gemm6;
	struct fla_gemm_s *sub_gemm7;
	struct fla_gemm_s *sub_gemm8;
} fla_sylv_t;


// Control node for lyap: common header, a scal link, a child link of the
// same type, a sylv link, two gemm links, and hemm and her2k links.
typedef struct fla_lyap_s
{
	FLA_Matrix_type    matrix_type;
	int                variant;
	fla_blocksize_t   *blocksize;
	struct fla_scal_s  *sub_scal;
	struct fla_lyap_s  *sub_lyap;
	struct fla_sylv_s  *sub_sylv;
	struct fla_gemm_s  *sub_gemm1;
	struct fla_gemm_s  *sub_gemm2;
	struct fla_hemm_s  *sub_hemm;
	struct fla_her2k_s *sub_her2k;
} fla_lyap_t;


// Control node for spdinv: common header plus links to chol, trinv and ttmm
// nodes. Unlike most nodes here it has no child link of its own type.
typedef struct fla_spdinv_s
{
	FLA_Matrix_type    matrix_type;
	int                variant;
	fla_blocksize_t   *blocksize;
	struct fla_chol_s  *sub_chol;
	struct fla_trinv_s *sub_trinv;
	struct fla_ttmm_s  *sub_ttmm;
} fla_spdinv_t;


// Control node for apqut: common header, a child link of the same type, two
// trmm links, two gemm links, and trsm, copyt and axpyt links.
typedef struct fla_apqut_s
{
	FLA_Matrix_type    matrix_type;
	int                variant;
	fla_blocksize_t   *blocksize;
	struct fla_apqut_s *sub_apqut;
	struct fla_trmm_s  *sub_trmm1;
	struct fla_trmm_s  *sub_trmm2;
	struct fla_gemm_s  *sub_gemm1;
	struct fla_gemm_s  *sub_gemm2;
	struct fla_trsm_s  *sub_trsm;
	struct fla_copyt_s *sub_copyt;
	struct fla_axpyt_s *sub_axpyt;
} fla_apqut_t;


// Control node for apq2ut: common header, a child link of the same type, two
// gemm links, and trsm, copyt and axpyt links.
typedef struct fla_apq2ut_s
{
	FLA_Matrix_type     matrix_type;
	int                 variant;
	fla_blocksize_t    *blocksize;
	struct fla_apq2ut_s *sub_apq2ut;
	struct fla_gemm_s   *sub_gemm1;
	struct fla_gemm_s   *sub_gemm2;
	struct fla_trsm_s   *sub_trsm;
	struct fla_copyt_s  *sub_copyt;
	struct fla_axpyt_s  *sub_axpyt;
} fla_apq2ut_t;


// Control node for caqrutinc: common header plus links to caqr2ut and
// apcaq2ut nodes. struct fla_apcaq2ut_s is only used through a pointer here;
// it is defined later in the header.
typedef struct fla_caqrutinc_s
{
	FLA_Matrix_type       matrix_type;
	int                   variant;
	fla_blocksize_t      *blocksize;
	struct fla_caqr2ut_s  *sub_caqr2ut;
	struct fla_apcaq2ut_s *sub_apcaq2ut;
} fla_caqrutinc_t;


// Control node for apcaqutinc: common header plus a single link to an
// apcaq2ut node.
typedef struct fla_apcaqutinc_s
{
	FLA_Matrix_type       matrix_type;
	int                   variant;
	fla_blocksize_t      *blocksize;
	struct fla_apcaq2ut_s *sub_apcaq2ut;
} fla_apcaqutinc_t;


// Control node for apcaq2ut: common header, a child link of the same type,
// two gemm links, two trmm links, one trsm link, three axpy links and one
// copy link.
typedef struct fla_apcaq2ut_s
{
	FLA_Matrix_type       matrix_type;
	int                   variant;
	fla_blocksize_t      *blocksize;
	struct fla_apcaq2ut_s *sub_apcaq2ut;
	struct fla_gemm_s     *sub_gemm1;
	struct fla_gemm_s     *sub_gemm2;
	struct fla_trmm_s     *sub_trmm1;
	struct fla_trmm_s     *sub_trmm2;
	struct fla_trsm_s     *sub_trsm;
	struct fla_axpy_s     *sub_axpy1;
	struct fla_axpy_s     *sub_axpy2;
	struct fla_axpy_s     *sub_axpy3;
	struct fla_copy_s     *sub_copy;
} fla_apcaq2ut_t;


// Control node typedef'd as fla_qrutinc_t (tag fla_qr_ut_inc_s): common
// header plus links to fla_qr_ut_s, fla_qr2_ut_s, apqut and apq2ut nodes.
typedef struct fla_qr_ut_inc_s
{
	FLA_Matrix_type     matrix_type;
	int                 variant;
	fla_blocksize_t    *blocksize;
	struct fla_qr_ut_s  *sub_qrut;
	struct fla_qr2_ut_s *sub_qr2ut;
	struct fla_apqut_s  *sub_apqut;
	struct fla_apq2ut_s *sub_apq2ut;
} fla_qrutinc_t;


// Control node for apqutinc: common header plus links to apqut and apq2ut
// nodes.
typedef struct fla_apqutinc_s
{
	FLA_Matrix_type     matrix_type;
	int                 variant;
	fla_blocksize_t    *blocksize;
	struct fla_apqut_s  *sub_apqut;
	struct fla_apq2ut_s *sub_apq2ut;
} fla_apqutinc_t;


// Control node for uddateut: common header, a child link of the same type,
// and a link to an apqudut node (struct fla_apqudut_s is defined later in
// the header and is only used through a pointer here).
typedef struct fla_uddateut_s
{
	FLA_Matrix_type       matrix_type;
	int                   variant;
	fla_blocksize_t      *blocksize;
	struct fla_uddateut_s *sub_uddateut;
	struct fla_apqudut_s  *sub_apqudut;
} fla_uddateut_t;


// Control node for apqudut: common header, a child link of the same type,
// four gemm links, and trsm, copyt and axpyt links.
typedef struct fla_apqudut_s
{
	FLA_Matrix_type      matrix_type;
	int                  variant;
	fla_blocksize_t     *blocksize;
	struct fla_apqudut_s *sub_apqudut;
	struct fla_gemm_s    *sub_gemm1;
	struct fla_gemm_s    *sub_gemm2;
	struct fla_gemm_s    *sub_gemm3;
	struct fla_gemm_s    *sub_gemm4;
	struct fla_trsm_s    *sub_trsm;
	struct fla_copyt_s   *sub_copyt;
	struct fla_axpyt_s   *sub_axpyt;
} fla_apqudut_t;


// Control node for uddateutinc: common header plus links to uddateut and
// apqudut nodes.
typedef struct fla_uddateutinc_s
{
	FLA_Matrix_type       matrix_type;
	int                   variant;
	fla_blocksize_t      *blocksize;
	struct fla_uddateut_s *sub_uddateut;
	struct fla_apqudut_s  *sub_apqudut;
} fla_uddateutinc_t;


// Control node for apqudutinc: common header plus a single link to an
// apqudut node.
typedef struct fla_apqudutinc_s
{
	FLA_Matrix_type      matrix_type;
	int                  variant;
	fla_blocksize_t     *blocksize;
	struct fla_apqudut_s *sub_apqudut;
} fla_apqudutinc_t;


// Control node for eig_gest: common header, a child link of the same type,
// two axpy links, three gemm links, hemm and her2k links, two trmm links and
// two trsm links.
typedef struct fla_eig_gest_s
{
	FLA_Matrix_type       matrix_type;
	int                   variant;
	fla_blocksize_t      *blocksize;
	struct fla_eig_gest_s *sub_eig_gest;
	struct fla_axpy_s     *sub_axpy1;
	struct fla_axpy_s     *sub_axpy2;
	struct fla_gemm_s     *sub_gemm1;
	struct fla_gemm_s     *sub_gemm2;
	struct fla_gemm_s     *sub_gemm3;
	struct fla_hemm_s     *sub_hemm;
	struct fla_her2k_s    *sub_her2k;
	struct fla_trmm_s     *sub_trmm1;
	struct fla_trmm_s     *sub_trmm2;
	struct fla_trsm_s     *sub_trsm1;
	struct fla_trsm_s     *sub_trsm2;
} fla_eig_gest_t;


// Child-link accessors for LAPACK-level sub-nodes. `cntl` must be a pointer
// to a node struct that has the named member. Argument and expansion are
// parenthesized so that pointer expressions such as `p + 1` or `&node`
// expand correctly; each result is still an lvalue.
// NOTE(review): in the portion of the header reviewed, no struct declares
// sub_qr, sub_lq, sub_hessut, sub_tridiagut or sub_bidiagut; those accessors
// only compile against a type that provides such a member.
#define FLA_Cntl_sub_chol( cntl )      ( (cntl)->sub_chol )
#define FLA_Cntl_sub_lu( cntl )        ( (cntl)->sub_lu )
#define FLA_Cntl_sub_qr( cntl )        ( (cntl)->sub_qr )
#define FLA_Cntl_sub_qrut( cntl )      ( (cntl)->sub_qrut )
#define FLA_Cntl_sub_qr2ut( cntl )     ( (cntl)->sub_qr2ut )
#define FLA_Cntl_sub_lq( cntl )        ( (cntl)->sub_lq )
#define FLA_Cntl_sub_lqut( cntl )      ( (cntl)->sub_lqut )
#define FLA_Cntl_sub_caqr2ut( cntl )   ( (cntl)->sub_caqr2ut )
#define FLA_Cntl_sub_trinv( cntl )     ( (cntl)->sub_trinv )
#define FLA_Cntl_sub_ttmm( cntl )      ( (cntl)->sub_ttmm )
#define FLA_Cntl_sub_sylv( cntl )      ( (cntl)->sub_sylv )
#define FLA_Cntl_sub_sylv1( cntl )     ( (cntl)->sub_sylv1 )
#define FLA_Cntl_sub_sylv2( cntl )     ( (cntl)->sub_sylv2 )
#define FLA_Cntl_sub_sylv3( cntl )     ( (cntl)->sub_sylv3 )
#define FLA_Cntl_sub_lyap( cntl )      ( (cntl)->sub_lyap )
#define FLA_Cntl_sub_appiv( cntl )     ( (cntl)->sub_appiv )
#define FLA_Cntl_sub_appiv1( cntl )    ( (cntl)->sub_appiv1 )
#define FLA_Cntl_sub_appiv2( cntl )    ( (cntl)->sub_appiv2 )
#define FLA_Cntl_sub_apqut( cntl )     ( (cntl)->sub_apqut )
#define FLA_Cntl_sub_apq2ut( cntl )    ( (cntl)->sub_apq2ut )
#define FLA_Cntl_sub_apcaq2ut( cntl )  ( (cntl)->sub_apcaq2ut )
#define FLA_Cntl_sub_uddateut( cntl )  ( (cntl)->sub_uddateut )
#define FLA_Cntl_sub_apqudut( cntl )   ( (cntl)->sub_apqudut )
#define FLA_Cntl_sub_hessut( cntl )    ( (cntl)->sub_hessut )
#define FLA_Cntl_sub_tridiagut( cntl ) ( (cntl)->sub_tridiagut )
#define FLA_Cntl_sub_bidiagut( cntl )  ( (cntl)->sub_bidiagut )
#define FLA_Cntl_sub_eig_gest( cntl )  ( (cntl)->sub_eig_gest )


// Prototypes for the *_obj_create routines of the chol, lu and appiv control
// structures. Every routine in this family takes the same three leading
// arguments (a FLA_Matrix_type, an int variant and a fla_blocksize_t pointer)
// followed by pointers to the sub-control structures it references, and
// returns a pointer to the control structure type named in its return type.
// Allocation and ownership of the returned pointer and of the arguments are
// not visible from these declarations -- confirm against the definitions.

// chol: a sub-control of its own type plus herk, trsm and gemm sub-controls.
fla_chol_t* FLA_Cntl_chol_obj_create( FLA_Matrix_type  matrix_type,
                                      int              variant,
                                      fla_blocksize_t* blocksize,
                                      fla_chol_t*      sub_chol,
                                      fla_herk_t*      sub_herk,
                                      fla_trsm_t*      sub_trsm,
                                      fla_gemm_t*      sub_gemm );
// lu: a sub-control of its own type, three gemm, two trsm and two appiv
// sub-controls.
fla_lu_t* FLA_Cntl_lu_obj_create( FLA_Matrix_type  matrix_type,
                                  int              variant,
                                  fla_blocksize_t* blocksize,
                                  fla_lu_t*        sub_lu,
                                  fla_gemm_t*      sub_gemm1,
                                  fla_gemm_t*      sub_gemm2,
                                  fla_gemm_t*      sub_gemm3,
                                  fla_trsm_t*      sub_trsm1,
                                  fla_trsm_t*      sub_trsm2,
                                  fla_appiv_t*     sub_appiv1,
                                  fla_appiv_t*     sub_appiv2 );
// appiv: only a sub-control of its own type.
fla_appiv_t* FLA_Cntl_appiv_obj_create( FLA_Matrix_type  matrix_type,
                                        int              variant,
                                        fla_blocksize_t* blocksize,
                                        fla_appiv_t*     sub_appiv );
// *_obj_create prototypes for the qrut, qr2ut, lqut, caqr2ut, hessut,
// tridiagut and bidiagut control structures. All take a FLA_Matrix_type, an
// int variant and a fla_blocksize_t pointer first; the remaining arguments
// are pointers to sub-control structures. Each returns a pointer to the
// control structure type named in its return type.

// qrut: a qrut sub-control plus an apqut sub-control.
fla_qrut_t* FLA_Cntl_qrut_obj_create( FLA_Matrix_type  matrix_type,
                                      int              variant,
                                      fla_blocksize_t* blocksize,
                                      fla_qrut_t*      sub_qrut,
                                      fla_apqut_t*     sub_apqut );
// qr2ut: a qr2ut sub-control, two gemm, one trsm, one copy and one axpy
// sub-control.
fla_qr2ut_t* FLA_Cntl_qr2ut_obj_create( FLA_Matrix_type  matrix_type,
                                        int              variant,
                                        fla_blocksize_t* blocksize,
                                        fla_qr2ut_t*     sub_qr2ut,
                                        fla_gemm_t*      sub_gemm1,
                                        fla_gemm_t*      sub_gemm2,
                                        fla_trsm_t*      sub_trsm,
                                        fla_copy_t*      sub_copy,
                                        fla_axpy_t*      sub_axpy );
// lqut: same argument shape as qrut (own-type sub-control plus apqut).
fla_lqut_t* FLA_Cntl_lqut_obj_create( FLA_Matrix_type  matrix_type,
                                      int              variant,
                                      fla_blocksize_t* blocksize,
                                      fla_lqut_t*      sub_lqut,
                                      fla_apqut_t*     sub_apqut );
// caqr2ut: a caqr2ut sub-control, two gemm, two trmm, one trsm, three axpy
// and one copy sub-control.
fla_caqr2ut_t* FLA_Cntl_caqr2ut_obj_create( FLA_Matrix_type  matrix_type,
                                            int              variant,
                                            fla_blocksize_t* blocksize,
                                            fla_caqr2ut_t*   sub_caqr2ut,
                                            fla_gemm_t*      sub_gemm1,
                                            fla_gemm_t*      sub_gemm2,
                                            fla_trmm_t*      sub_trmm1,
                                            fla_trmm_t*      sub_trmm2,
                                            fla_trsm_t*      sub_trsm,
                                            fla_axpy_t*      sub_axpy1,
                                            fla_axpy_t*      sub_axpy2,
                                            fla_axpy_t*      sub_axpy3,
                                            fla_copy_t*      sub_copy );
// hessut, tridiagut and bidiagut take only the three common arguments; no
// sub-control pointers are passed in.
fla_hessut_t* FLA_Cntl_hessut_obj_create( FLA_Matrix_type  matrix_type,
                                          int              variant,
                                          fla_blocksize_t* blocksize );
fla_tridiagut_t* FLA_Cntl_tridiagut_obj_create( FLA_Matrix_type  matrix_type,
                                                int              variant,
                                                fla_blocksize_t* blocksize );
fla_bidiagut_t* FLA_Cntl_bidiagut_obj_create( FLA_Matrix_type  matrix_type,
                                              int              variant,
                                              fla_blocksize_t* blocksize );
// *_obj_create prototypes for the trinv, ttmm, sylv, lyap and spdinv control
// structures. All take a FLA_Matrix_type, an int variant and a
// fla_blocksize_t pointer first; the remaining arguments are pointers to
// sub-control structures. Each returns a pointer to the control structure
// type named in its return type.

// trinv: a trinv sub-control, one trmm, two trsm and one gemm sub-control.
fla_trinv_t* FLA_Cntl_trinv_obj_create( FLA_Matrix_type  matrix_type,
                                        int              variant,
                                        fla_blocksize_t* blocksize,
                                        fla_trinv_t*     sub_trinv,
                                        fla_trmm_t*      sub_trmm,
                                        fla_trsm_t*      sub_trsm1,
                                        fla_trsm_t*      sub_trsm2,
                                        fla_gemm_t*      sub_gemm );
// ttmm: a ttmm sub-control plus herk, trmm and gemm sub-controls.
fla_ttmm_t* FLA_Cntl_ttmm_obj_create( FLA_Matrix_type  matrix_type,
                                      int              variant,
                                      fla_blocksize_t* blocksize,
                                      fla_ttmm_t*      sub_ttmm,
                                      fla_herk_t*      sub_herk,
                                      fla_trmm_t*      sub_trmm,
                                      fla_gemm_t*      sub_gemm );
// sylv: three sylv sub-controls and eight gemm sub-controls.
fla_sylv_t* FLA_Cntl_sylv_obj_create( FLA_Matrix_type  matrix_type,
                                      int              variant,
                                      fla_blocksize_t* blocksize,
                                      fla_sylv_t*      sub_sylv1,
                                      fla_sylv_t*      sub_sylv2,
                                      fla_sylv_t*      sub_sylv3,
                                      fla_gemm_t*      sub_gemm1,
                                      fla_gemm_t*      sub_gemm2,
                                      fla_gemm_t*      sub_gemm3,
                                      fla_gemm_t*      sub_gemm4,
                                      fla_gemm_t*      sub_gemm5,
                                      fla_gemm_t*      sub_gemm6,
                                      fla_gemm_t*      sub_gemm7,
                                      fla_gemm_t*      sub_gemm8 );
// lyap: scal, lyap and sylv sub-controls, two gemm, one hemm and one her2k
// sub-control. Note that sub_scal precedes the own-type sub_lyap here.
fla_lyap_t* FLA_Cntl_lyap_obj_create( FLA_Matrix_type  matrix_type,
                                      int              variant,
                                      fla_blocksize_t* blocksize,
                                      fla_scal_t*      sub_scal,
                                      fla_lyap_t*      sub_lyap,
                                      fla_sylv_t*      sub_sylv,
                                      fla_gemm_t*      sub_gemm1,
                                      fla_gemm_t*      sub_gemm2,
                                      fla_hemm_t*      sub_hemm,
                                      fla_her2k_t*     sub_her2k );
// spdinv: chol, trinv and ttmm sub-controls; no sub-control of its own type.
fla_spdinv_t* FLA_Cntl_spdinv_obj_create( FLA_Matrix_type  matrix_type,
                                          int              variant,
                                          fla_blocksize_t* blocksize,
                                          fla_chol_t*      sub_chol,
                                          fla_trinv_t*     sub_trinv,
                                          fla_ttmm_t*      sub_ttmm );
// *_obj_create prototypes for the apqut, apq2ut and apcaq2ut control
// structures. All take a FLA_Matrix_type, an int variant and a
// fla_blocksize_t pointer first, then a sub-control of their own type,
// followed by pointers to further sub-control structures. Each returns a
// pointer to the control structure type named in its return type.

// apqut: two trmm, two gemm, one trsm, one copyt and one axpyt sub-control.
fla_apqut_t* FLA_Cntl_apqut_obj_create( FLA_Matrix_type  matrix_type,
                                        int              variant,
                                        fla_blocksize_t* blocksize,
                                        fla_apqut_t*     sub_apqut,
                                        fla_trmm_t*      sub_trmm1,
                                        fla_trmm_t*      sub_trmm2,
                                        fla_gemm_t*      sub_gemm1,
                                        fla_gemm_t*      sub_gemm2,
                                        fla_trsm_t*      sub_trsm,
                                        fla_copyt_t*     sub_copyt,
                                        fla_axpyt_t*     sub_axpyt );
// apq2ut: two gemm, one trsm, one copyt and one axpyt sub-control.
fla_apq2ut_t* FLA_Cntl_apq2ut_obj_create( FLA_Matrix_type  matrix_type,
                                          int              variant,
                                          fla_blocksize_t* blocksize,
                                          fla_apq2ut_t*    sub_apq2ut,
                                          fla_gemm_t*      sub_gemm1,
                                          fla_gemm_t*      sub_gemm2,
                                          fla_trsm_t*      sub_trsm,
                                          fla_copyt_t*     sub_copyt,
                                          fla_axpyt_t*     sub_axpyt );
// apcaq2ut: two gemm, two trmm, one trsm, three axpy and one copy
// sub-control (copy/axpy here, not the copyt/axpyt used by the two above).
fla_apcaq2ut_t* FLA_Cntl_apcaq2ut_obj_create( FLA_Matrix_type  matrix_type,
                                              int              variant,
                                              fla_blocksize_t* blocksize,
                                              fla_apcaq2ut_t*  sub_apcaq2ut,
                                              fla_gemm_t*      sub_gemm1,
                                              fla_gemm_t*      sub_gemm2,
                                              fla_trmm_t*      sub_trmm1,
                                              fla_trmm_t*      sub_trmm2,
                                              fla_trsm_t*      sub_trsm,
                                              fla_axpy_t*      sub_axpy1,
                                              fla_axpy_t*      sub_axpy2,
                                              fla_axpy_t*      sub_axpy3,
                                              fla_copy_t*      sub_copy );
// *_obj_create prototypes for the "inc" control structures qrutinc,
// apqutinc, caqrutinc and apcaqutinc. All take a FLA_Matrix_type, an int
// variant and a fla_blocksize_t pointer first. Unlike most other creators in
// this header, none of them takes a sub-control of its own type; they only
// receive sub-controls of the non-"inc" types listed in each signature.

// qrutinc: qrut, apqut, qr2ut and apq2ut sub-controls.
fla_qrutinc_t* FLA_Cntl_qrutinc_obj_create( FLA_Matrix_type  matrix_type,
                                            int              variant,
                                            fla_blocksize_t* blocksize,
                                            fla_qrut_t*      sub_qrut,
                                            fla_apqut_t*     sub_apqut,
                                            fla_qr2ut_t*     sub_qr2ut,
                                            fla_apq2ut_t*    sub_apq2ut );
// apqutinc: apqut and apq2ut sub-controls.
fla_apqutinc_t* FLA_Cntl_apqutinc_obj_create( FLA_Matrix_type  matrix_type,
                                              int              variant,
                                              fla_blocksize_t* blocksize,
                                              fla_apqut_t*     sub_apqut,
                                              fla_apq2ut_t*    sub_apq2ut );
// caqrutinc: caqr2ut and apcaq2ut sub-controls.
fla_caqrutinc_t* FLA_Cntl_caqrutinc_obj_create( FLA_Matrix_type  matrix_type,
                                                int              variant,
                                                fla_blocksize_t* blocksize,
                                                fla_caqr2ut_t*   sub_caqr2ut,
                                                fla_apcaq2ut_t*  sub_apcaq2ut );
// apcaqutinc: a single apcaq2ut sub-control.
fla_apcaqutinc_t* FLA_Cntl_apcaqutinc_obj_create( FLA_Matrix_type  matrix_type,
                                                  int              variant,
                                                  fla_blocksize_t* blocksize,
                                                  fla_apcaq2ut_t*  sub_apcaq2ut );
// *_obj_create prototypes for the uddateut, apqudut, uddateutinc and
// apqudutinc control structures. All take a FLA_Matrix_type, an int variant
// and a fla_blocksize_t pointer first; the remaining arguments are pointers
// to sub-control structures. Each returns a pointer to the control structure
// type named in its return type.

// uddateut: a uddateut sub-control plus an apqudut sub-control.
fla_uddateut_t* FLA_Cntl_uddateut_obj_create( FLA_Matrix_type  matrix_type,
                                              int              variant,
                                              fla_blocksize_t* blocksize,
                                              fla_uddateut_t*  sub_uddateut,
                                              fla_apqudut_t*   sub_apqudut );
// apqudut: an apqudut sub-control, four gemm, one trsm, one copyt and one
// axpyt sub-control.
// NOTE(review): the fourth parameter has type fla_apqudut_t* but is named
// sub_apq2ut; this looks like a copy-paste leftover for sub_apqudut. The
// name in a prototype has no effect on callers -- confirm against the
// definition before renaming.
fla_apqudut_t* FLA_Cntl_apqudut_obj_create( FLA_Matrix_type  matrix_type,
                                            int              variant,
                                            fla_blocksize_t* blocksize,
                                            fla_apqudut_t*   sub_apq2ut,
                                            fla_gemm_t*      sub_gemm1,
                                            fla_gemm_t*      sub_gemm2,
                                            fla_gemm_t*      sub_gemm3,
                                            fla_gemm_t*      sub_gemm4,
                                            fla_trsm_t*      sub_trsm,
                                            fla_copyt_t*     sub_copyt,
                                            fla_axpyt_t*     sub_axpyt );
// uddateutinc: uddateut and apqudut sub-controls (no own-type sub-control).
fla_uddateutinc_t* FLA_Cntl_uddateutinc_obj_create( FLA_Matrix_type  matrix_type,
                                                    int              variant,
                                                    fla_blocksize_t* blocksize,
                                                    fla_uddateut_t*  sub_uddateut,
                                                    fla_apqudut_t*   sub_apqudut );
// apqudutinc: a single apqudut sub-control.
fla_apqudutinc_t* FLA_Cntl_apqudutinc_obj_create( FLA_Matrix_type  matrix_type,
                                                  int              variant,
                                                  fla_blocksize_t* blocksize,
                                                  fla_apqudut_t*   sub_apqudut );
// Creator prototype for fla_eig_gest_t. The parameter list mirrors, in the
// same order, the members of struct fla_eig_gest_s declared above:
// matrix_type, variant, blocksize, an eig_gest sub-control, two axpy, three
// gemm, one hemm, one her2k, two trmm and two trsm sub-controls.
// Returns a pointer to fla_eig_gest_t; ownership is not visible here.
fla_eig_gest_t* FLA_Cntl_eig_gest_obj_create( FLA_Matrix_type  matrix_type,
                                              int              variant,
                                              fla_blocksize_t* blocksize,
                                              fla_eig_gest_t*  sub_eig_gest,
                                              fla_axpy_t*      sub_axpy1,
                                              fla_axpy_t*      sub_axpy2,
                                              fla_gemm_t*      sub_gemm1,
                                              fla_gemm_t*      sub_gemm2,
                                              fla_gemm_t*      sub_gemm3,
                                              fla_hemm_t*      sub_hemm,
                                              fla_her2k_t*     sub_her2k,
                                              fla_trmm_t*      sub_trmm1,
                                              fla_trmm_t*      sub_trmm2,
                                              fla_trsm_t*      sub_trsm1,
                                              fla_trsm_t*      sub_trsm2 );

// end FLA_Cntl_lapack.h

// end FLA_Cntl.h
// begin FLA_Cntl_init.h


// begin FLA_Cntl_init_flamec.h


// Init/finalize pair for the flamec control structures. Both take no
// arguments and return nothing. Presumably these drive the per-operation
// FLA_*_cntl_init / FLA_*_cntl_finalize routines declared below -- confirm
// in the definitions.
void FLA_Cntl_init_flamec( void );
void FLA_Cntl_finalize_flamec( void );


// --- Base library prototypes -------------------------------------------------
// Init/finalize pair for the Transpose control structure(s); no arguments,
// no return value.
void FLA_Transpose_cntl_init( void );

void FLA_Transpose_cntl_finalize( void );


// --- Level-1 BLAS prototypes -------------------------------------------------
// One _cntl_init and one matching _cntl_finalize routine per operation
// (Axpy, Axpyt, Copy, Copyt, Copyr, Scal, Scalr). All take no arguments and
// return nothing.
void FLA_Axpy_cntl_init( void );
void FLA_Axpyt_cntl_init( void );
void FLA_Copy_cntl_init( void );
void FLA_Copyt_cntl_init( void );
void FLA_Copyr_cntl_init( void );
void FLA_Scal_cntl_init( void );
void FLA_Scalr_cntl_init( void );

void FLA_Axpy_cntl_finalize( void );
void FLA_Axpyt_cntl_finalize( void );
void FLA_Copy_cntl_finalize( void );
void FLA_Copyt_cntl_finalize( void );
void FLA_Copyr_cntl_finalize( void );
void FLA_Scal_cntl_finalize( void );
void FLA_Scalr_cntl_finalize( void );


// --- Level-2 BLAS prototypes -------------------------------------------------
// Init/finalize pairs for Gemv and Trsv; no arguments, no return value.
void FLA_Gemv_cntl_init( void );
void FLA_Trsv_cntl_init( void );

void FLA_Gemv_cntl_finalize( void );
void FLA_Trsv_cntl_finalize( void );


// --- Level-3 BLAS prototypes -------------------------------------------------
// One _cntl_init and one matching _cntl_finalize routine per operation
// (Gemm, Hemm, Herk, Her2k, Symm, Syrk, Syr2k, Trmm, Trsm). All take no
// arguments and return nothing.
void FLA_Gemm_cntl_init( void );
void FLA_Hemm_cntl_init( void );
void FLA_Herk_cntl_init( void );
void FLA_Her2k_cntl_init( void );
void FLA_Symm_cntl_init( void );
void FLA_Syrk_cntl_init( void );
void FLA_Syr2k_cntl_init( void );
void FLA_Trmm_cntl_init( void );
void FLA_Trsm_cntl_init( void );

void FLA_Gemm_cntl_finalize( void );
void FLA_Hemm_cntl_finalize( void );
void FLA_Herk_cntl_finalize( void );
void FLA_Her2k_cntl_finalize( void );
void FLA_Symm_cntl_finalize( void );
void FLA_Syrk_cntl_finalize( void );
void FLA_Syr2k_cntl_finalize( void );
void FLA_Trmm_cntl_finalize( void );
void FLA_Trsm_cntl_finalize( void );


// --- LAPACK-level prototypes -------------------------------------------------
// Twenty-two _cntl_init routines followed by the twenty-two matching
// _cntl_finalize routines, listed in the same order. All take no arguments
// and return nothing. Any required call order between them is not visible
// from these declarations.
void FLA_Apply_pivots_cntl_init( void );
void FLA_Chol_cntl_init( void );
void FLA_LU_piv_cntl_init( void );
void FLA_LU_nopiv_cntl_init( void );
void FLA_QR_UT_cntl_init( void );
void FLA_QR2_UT_cntl_init( void );
void FLA_LQ_UT_cntl_init( void );
void FLA_CAQR2_UT_cntl_init( void );
void FLA_UDdate_UT_cntl_init( void );
void FLA_Hess_UT_cntl_init( void );
void FLA_Tridiag_UT_cntl_init( void );
void FLA_Bidiag_UT_cntl_init( void );
void FLA_Trinv_cntl_init( void );
void FLA_Ttmm_cntl_init( void );
void FLA_Sylv_cntl_init( void );
void FLA_Lyap_cntl_init( void );
void FLA_SPDinv_cntl_init( void );
void FLA_Apply_Q_UT_cntl_init( void );
void FLA_Apply_Q2_UT_cntl_init( void );
void FLA_Apply_CAQ2_UT_cntl_init( void );
void FLA_Apply_QUD_UT_cntl_init( void );
void FLA_Eig_gest_cntl_init( void );

void FLA_Apply_pivots_cntl_finalize( void );
void FLA_Chol_cntl_finalize( void );
void FLA_LU_piv_cntl_finalize( void );
void FLA_LU_nopiv_cntl_finalize( void );
void FLA_QR_UT_cntl_finalize( void );
void FLA_QR2_UT_cntl_finalize( void );
void FLA_LQ_UT_cntl_finalize( void );
void FLA_CAQR2_UT_cntl_finalize( void );
void FLA_UDdate_UT_cntl_finalize( void );
void FLA_Hess_UT_cntl_finalize( void );
void FLA_Tridiag_UT_cntl_finalize( void );
void FLA_Bidiag_UT_cntl_finalize( void );
void FLA_Trinv_cntl_finalize( void );
void FLA_Ttmm_cntl_finalize( void );
void FLA_Sylv_cntl_finalize( void );
void FLA_Lyap_cntl_finalize( void );
void FLA_SPDinv_cntl_finalize( void );
void FLA_Apply_Q_UT_cntl_finalize( void );
void FLA_Apply_Q2_UT_cntl_finalize( void );
void FLA_Apply_CAQ2_UT_cntl_finalize( void );
void FLA_Apply_QUD_UT_cntl_finalize( void );
void FLA_Eig_gest_cntl_finalize( void );

// end FLA_Cntl_init_flamec.h
// begin FLA_Cntl_init_flash.h


// Init/finalize pair for the FLASH control structures. Both take no
// arguments and return nothing. Presumably these drive the per-operation
// FLASH_*_cntl_init / FLASH_*_cntl_finalize routines declared below --
// confirm in the definitions.
void FLA_Cntl_init_flash( void );
void FLA_Cntl_finalize_flash( void );


// --- Base library prototypes -------------------------------------------------
// FLASH init/finalize pair for the Transpose control structure(s); no
// arguments, no return value.
void FLASH_Transpose_cntl_init( void );

void FLASH_Transpose_cntl_finalize( void );


// --- Level-1 BLAS prototypes -------------------------------------------------
// FLASH counterparts of the FLA_ Level-1 routines: one _cntl_init and one
// matching _cntl_finalize per operation (Axpy, Axpyt, Copy, Copyt, Copyr,
// Scal, Scalr). All take no arguments and return nothing.
void FLASH_Axpy_cntl_init( void );
void FLASH_Axpyt_cntl_init( void );
void FLASH_Copy_cntl_init( void );
void FLASH_Copyt_cntl_init( void );
void FLASH_Copyr_cntl_init( void );
void FLASH_Scal_cntl_init( void );
void FLASH_Scalr_cntl_init( void );

void FLASH_Axpy_cntl_finalize( void );
void FLASH_Axpyt_cntl_finalize( void );
void FLASH_Copy_cntl_finalize( void );
void FLASH_Copyt_cntl_finalize( void );
void FLASH_Copyr_cntl_finalize( void );
void FLASH_Scal_cntl_finalize( void );
void FLASH_Scalr_cntl_finalize( void );


// --- Level-2 BLAS prototypes -------------------------------------------------
// FLASH init/finalize pairs for Gemv and Trsv; no arguments, no return value.
void FLASH_Gemv_cntl_init( void );
void FLASH_Trsv_cntl_init( void );

void FLASH_Gemv_cntl_finalize( void );
void FLASH_Trsv_cntl_finalize( void );


// --- Level-3 BLAS prototypes -------------------------------------------------
// FLASH counterparts of the FLA_ Level-3 routines: one _cntl_init and one
// matching _cntl_finalize per operation (Gemm, Hemm, Herk, Her2k, Symm,
// Syrk, Syr2k, Trmm, Trsm). All take no arguments and return nothing.
void FLASH_Gemm_cntl_init( void );
void FLASH_Hemm_cntl_init( void );
void FLASH_Herk_cntl_init( void );
void FLASH_Her2k_cntl_init( void );
void FLASH_Symm_cntl_init( void );
void FLASH_Syrk_cntl_init( void );
void FLASH_Syr2k_cntl_init( void );
void FLASH_Trmm_cntl_init( void );
void FLASH_Trsm_cntl_init( void );

void FLASH_Gemm_cntl_finalize( void );
void FLASH_Hemm_cntl_finalize( void );
void FLASH_Herk_cntl_finalize( void );
void FLASH_Her2k_cntl_finalize( void );
void FLASH_Symm_cntl_finalize( void );
void FLASH_Syrk_cntl_finalize( void );
void FLASH_Syr2k_cntl_finalize( void );
void FLASH_Trmm_cntl_finalize( void );
void FLASH_Trsm_cntl_finalize( void );


// --- LAPACK-level prototypes -------------------------------------------------
// Twenty-six FLASH _cntl_init routines followed by the twenty-six matching
// _cntl_finalize routines, listed in the same order. All take no arguments
// and return nothing. Compared with the FLA_ LAPACK-level list, this set
// additionally declares LU_incpiv and the *_inc variants, and has no
// Hess_UT / Tridiag_UT / Bidiag_UT entries.
void FLASH_Apply_pivots_cntl_init( void );
void FLASH_Chol_cntl_init( void );
void FLASH_LU_nopiv_cntl_init( void );
void FLASH_LU_piv_cntl_init( void );
void FLASH_LU_incpiv_cntl_init( void );
void FLASH_Trinv_cntl_init( void );
void FLASH_Ttmm_cntl_init( void );
void FLASH_SPDinv_cntl_init( void );
void FLASH_Sylv_cntl_init( void );
void FLASH_Lyap_cntl_init( void );
void FLASH_QR_UT_cntl_init( void );
void FLASH_QR2_UT_cntl_init( void );
void FLASH_LQ_UT_cntl_init( void );
void FLASH_CAQR2_UT_cntl_init( void );
void FLASH_UDdate_UT_cntl_init( void );
void FLASH_QR_UT_inc_cntl_init( void );
void FLASH_CAQR_UT_inc_cntl_init( void );
void FLASH_UDdate_UT_inc_cntl_init( void );
void FLASH_Apply_Q_UT_cntl_init( void );
void FLASH_Apply_Q2_UT_cntl_init( void );
void FLASH_Apply_CAQ2_UT_cntl_init( void );
void FLASH_Apply_QUD_UT_cntl_init( void );
void FLASH_Apply_Q_UT_inc_cntl_init( void );
void FLASH_Apply_CAQ_UT_inc_cntl_init( void );
void FLASH_Apply_QUD_UT_inc_cntl_init( void );
void FLASH_Eig_gest_cntl_init( void );

void FLASH_Apply_pivots_cntl_finalize( void );
void FLASH_Chol_cntl_finalize( void );
void FLASH_LU_nopiv_cntl_finalize( void );
void FLASH_LU_piv_cntl_finalize( void );
void FLASH_LU_incpiv_cntl_finalize( void );
void FLASH_Trinv_cntl_finalize( void );
void FLASH_Ttmm_cntl_finalize( void );
void FLASH_SPDinv_cntl_finalize( void );
void FLASH_Sylv_cntl_finalize( void );
void FLASH_Lyap_cntl_finalize( void );
void FLASH_QR_UT_cntl_finalize( void );
void FLASH_QR2_UT_cntl_finalize( void );
void FLASH_LQ_UT_cntl_finalize( void );
void FLASH_CAQR2_UT_cntl_finalize( void );
void FLASH_UDdate_UT_cntl_finalize( void );
void FLASH_QR_UT_inc_cntl_finalize( void );
void FLASH_CAQR_UT_inc_cntl_finalize( void );
void FLASH_UDdate_UT_inc_cntl_finalize( void );
void FLASH_Apply_Q_UT_cntl_finalize( void );
void FLASH_Apply_Q2_UT_cntl_finalize( void );
void FLASH_Apply_CAQ2_UT_cntl_finalize( void );
void FLASH_Apply_QUD_UT_cntl_finalize( void );
void FLASH_Apply_Q_UT_inc_cntl_finalize( void );
void FLASH_Apply_CAQ_UT_inc_cntl_finalize( void );
void FLASH_Apply_QUD_UT_inc_cntl_finalize( void );
void FLASH_Eig_gest_cntl_finalize( void );

// end FLA_Cntl_init_flash.h

// Overall control-structure init/finalize pair; no arguments, no return
// value. Presumably wraps the _flamec and _flash pairs declared above --
// confirm in the definitions.
void FLA_Cntl_init( void );
void FLA_Cntl_finalize( void );

// end FLA_Cntl_init.h

  // Include prototypes for base FLAME routines.
// begin FLA_main_prototypes.h


#ifdef FLA_ENABLE_HIP
#include <rocblas/rocblas.h> // skipped
#include <rocsolver/rocsolver.h> // skipped
#endif

// -----------------------------------------------------------------------------

// Blocksize object interface: create, copy, set, scale, free and extract
// routines for fla_blocksize_t, plus blocksize queries.
// The four dim_t values b_s, b_d, b_c, b_z presumably hold one blocksize per
// floating-point datatype (single, double, complex, double complex) --
// confirm against the fla_blocksize_t definition.
// Whether the pointers returned by the create/query routines must be
// released with FLA_Blocksize_free is not visible here.
fla_blocksize_t* FLA_Blocksize_create( dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z );
fla_blocksize_t* FLA_Blocksize_create_copy( fla_blocksize_t* bp );
void             FLA_Blocksize_set( fla_blocksize_t* bp, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z );
void             FLA_Blocksize_scale( fla_blocksize_t* bp, double factor );
void             FLA_Blocksize_free( fla_blocksize_t* bp );
dim_t            FLA_Blocksize_extract( FLA_Datatype dt, fla_blocksize_t* bp );

// Queries keyed by a FLA_Dimension: the first returns a blocksize object,
// the second a single dim_t for the given datatype.
fla_blocksize_t* FLA_Query_blocksizes( FLA_Dimension dim );
dim_t            FLA_Query_blocksize( FLA_Datatype dt, FLA_Dimension dim );

// Both take the unprocessed object and a FLA_Quadrant direction and return
// a dim_t.
dim_t            FLA_Determine_blocksize( FLA_Obj A_unproc, FLA_Quadrant to_dir, fla_blocksize_t* cntl_blocksizes );
dim_t            FLA_determine_matrix_size( FLA_Obj A_unproc, FLA_Quadrant to_dir );



// -----------------------------------------------------------------------------

// Error-checking interface, part 1.
// FLA_Check_error_level / _set read and set an unsigned error-checking
// level; what _set returns is not visible here (possibly the previous
// level -- confirm).
// Every other FLA_Check_* routine returns an FLA_Error describing the
// outcome of the named check.
unsigned int  FLA_Check_error_level( void );
unsigned int  FLA_Check_error_level_set( unsigned int level );
// Takes an error code together with a file name and line number (presumably
// __FILE__ / __LINE__ of the call site -- confirm).
FLA_Error     FLA_Check_error_code_helper( int code, char* file, int line );
FLA_Error     FLA_Check_valid_side( FLA_Side side );
FLA_Error     FLA_Check_valid_uplo( FLA_Uplo uplo );
FLA_Error     FLA_Check_valid_trans( FLA_Trans trans );
FLA_Error     FLA_Check_valid_diag( FLA_Diag diag );
FLA_Error     FLA_Check_valid_conj( FLA_Conj conj );
// NOTE(review): the next two are declared with FLA_Conj parameters although
// they are named for "direct" and "storev"; confirm against the definitions
// whether dedicated parameter types were intended.
FLA_Error     FLA_Check_valid_direct( FLA_Conj direct );
FLA_Error     FLA_Check_valid_storev( FLA_Conj storev );
FLA_Error     FLA_Check_valid_inverse( FLA_Inv inv );
FLA_Error     FLA_Check_valid_datatype( FLA_Datatype datatype );
FLA_Error     FLA_Check_valid_object_datatype( FLA_Obj A );
FLA_Error     FLA_Check_valid_evd_type( FLA_Evd_type evd_type );
FLA_Error     FLA_Check_valid_svd_type( FLA_Svd_type svd_type );
FLA_Error     FLA_Check_valid_svd_type_combination( FLA_Svd_type svd_type_u, FLA_Svd_type svd_type_v );
FLA_Error     FLA_Check_valid_svd_type_and_trans_combination( FLA_Svd_type svd_type_u, FLA_Trans transu,
                                                              FLA_Svd_type svd_type_v, FLA_Trans transv );
// Datatype-class checks on a bare FLA_Datatype ...
FLA_Error     FLA_Check_floating_datatype( FLA_Datatype datatype );
FLA_Error     FLA_Check_int_datatype( FLA_Datatype datatype );
FLA_Error     FLA_Check_real_datatype( FLA_Datatype datatype );
FLA_Error     FLA_Check_complex_datatype( FLA_Datatype datatype );
// ... and the corresponding checks taking an FLA_Obj.
FLA_Error     FLA_Check_floating_object( FLA_Obj A );
FLA_Error     FLA_Check_int_object( FLA_Obj A );
FLA_Error     FLA_Check_real_object( FLA_Obj A );
FLA_Error     FLA_Check_comparable_object( FLA_Obj A );
FLA_Error     FLA_Check_complex_object( FLA_Obj A );
FLA_Error     FLA_Check_consistent_datatype( FLA_Datatype datatype, FLA_Obj A );
FLA_Error     FLA_Check_consistent_object_datatype( FLA_Obj A, FLA_Obj B );
FLA_Error     FLA_Check_identical_object_precision( FLA_Obj A, FLA_Obj B );
// Shape and dimension checks.
FLA_Error     FLA_Check_square( FLA_Obj A );
FLA_Error     FLA_Check_if_scalar( FLA_Obj A );
FLA_Error     FLA_Check_if_vector( FLA_Obj A );
FLA_Error     FLA_Check_conformal_dims( FLA_Trans trans, FLA_Obj A, FLA_Obj B );
FLA_Error     FLA_Check_matrix_matrix_dims( FLA_Trans transa, FLA_Trans transb, FLA_Obj A, FLA_Obj B, FLA_Obj C );
FLA_Error     FLA_Check_matrix_vector_dims( FLA_Trans trans, FLA_Obj A, FLA_Obj x, FLA_Obj y );
FLA_Error     FLA_Check_equal_vector_dims( FLA_Obj x, FLA_Obj y );
FLA_Error     FLA_Check_conj1_trans_and_datatype( FLA_Trans trans, FLA_Obj A );
FLA_Error     FLA_Check_hess_indices( FLA_Obj A, int ilo, int ihi );
FLA_Error     FLA_Check_null_pointer( void* ptr );
FLA_Error     FLA_Check_object_dims( FLA_Trans trans, dim_t m, dim_t n, FLA_Obj A );
FLA_Error     FLA_Check_valid_pivot_type( FLA_Pivot_type ptype );
FLA_Error     FLA_Check_malloc_pointer( void* ptr );
// Error-checking interface, part 2. Every routine returns an FLA_Error
// describing the outcome of the named check.
FLA_Error     FLA_Check_base_buffer_mismatch( FLA_Obj A, FLA_Obj B );
// Adjacency checks for a 2x2, 2x1 and 1x2 arrangement of objects.
FLA_Error     FLA_Check_adjacent_objects_2x2( FLA_Obj A11, FLA_Obj A12,
                                              FLA_Obj A21, FLA_Obj A22 );
FLA_Error     FLA_Check_adjacent_objects_2x1( FLA_Obj AT,
                                              FLA_Obj AB );
FLA_Error     FLA_Check_adjacent_objects_1x2( FLA_Obj AL, FLA_Obj AR );
FLA_Error     FLA_Check_blocksize_value( dim_t b );
FLA_Error     FLA_Check_blocksize_object( FLA_Datatype datatype, fla_blocksize_t* bp );
// Checks on integer results of file operations (judging by the names, of
// open/lseek/close/unlink/read/write -- confirm against the callers).
// NOTE(review): offsets and sizes are passed as int, not off_t / ssize_t.
FLA_Error     FLA_Check_file_descriptor( int fd );
FLA_Error     FLA_Check_lseek_result( int requested_offset, int lseek_r_val );
FLA_Error     FLA_Check_close_result( int close_r_val );
FLA_Error     FLA_Check_unlink_result( int unlink_r_val );
FLA_Error     FLA_Check_read_result( int requested_size, int read_r_val );
FLA_Error     FLA_Check_write_result( int requested_size, int write_r_val );
FLA_Error     FLA_Check_valid_quadrant( FLA_Quadrant quad );
FLA_Error     FLA_Check_vector_dim_min( FLA_Obj x, dim_t min_dim );
// Checks on the integer results of pthread_create / pthread_join (by name).
FLA_Error     FLA_Check_pthread_create_result( int pthread_create_r_val );
FLA_Error     FLA_Check_pthread_join_result( int pthread_join_r_val );
FLA_Error     FLA_Check_valid_isgn_value( FLA_Obj isgn );
FLA_Error     FLA_Check_sylv_matrix_dims( FLA_Obj A, FLA_Obj B, FLA_Obj C );
// Takes an FLA_Error and returns an FLA_Error.
FLA_Error     FLA_Check_chol_failure( FLA_Error r_val );
FLA_Error     FLA_Check_valid_elemtype( FLA_Elemtype elemtype );
FLA_Error     FLA_Check_posix_memalign_failure( int r_val );
FLA_Error     FLA_Check_submatrix_dims_and_offset( dim_t m, dim_t n, dim_t i, dim_t j, FLA_Obj A );
FLA_Error     FLA_Check_object_scalar_elemtype( FLA_Obj A );
FLA_Error     FLA_Check_object_matrix_elemtype( FLA_Obj A );
FLA_Error     FLA_Check_num_threads( unsigned int n_threads );
FLA_Error     FLA_Check_conj_and_datatype( FLA_Conj conj, FLA_Obj A );
FLA_Error     FLA_Check_valid_complex_trans( FLA_Trans trans );
FLA_Error     FLA_Check_valid_real_trans( FLA_Trans trans );
FLA_Error     FLA_Check_valid_blas_trans( FLA_Trans trans );
FLA_Error     FLA_Check_nonconstant_datatype( FLA_Datatype datatype );
FLA_Error     FLA_Check_nonconstant_object( FLA_Obj A );
FLA_Error     FLA_Check_identical_object_datatype( FLA_Obj A, FLA_Obj B );
FLA_Error     FLA_Check_divide_by_zero( FLA_Obj alpha );
FLA_Error     FLA_Check_identical_object_elemtype( FLA_Obj A, FLA_Obj B );
FLA_Error     FLA_Check_pivot_index_range( FLA_Obj p, dim_t k1, dim_t k2 );
FLA_Error     FLA_Check_householder_panel_dims( FLA_Obj A, FLA_Obj T );
// Object length/width checks against an exact or minimum dim_t value.
FLA_Error     FLA_Check_object_length_equals( FLA_Obj A, dim_t m );
FLA_Error     FLA_Check_object_width_equals( FLA_Obj A, dim_t n );
FLA_Error     FLA_Check_object_length_min( FLA_Obj A, dim_t m );
FLA_Error     FLA_Check_object_width_min( FLA_Obj A, dim_t n );
// Error-checking interface, part 3. Every routine returns an FLA_Error
// describing the outcome of the named check.
FLA_Error     FLA_Check_valid_error_level( unsigned int level );
// Checks for repartitioning a 2x2, 2x1 or 1x2 view by the given block
// dimensions.
FLA_Error     FLA_Check_attempted_repart_2x2( FLA_Obj A_quad, dim_t b_m, dim_t b_n );
FLA_Error     FLA_Check_attempted_repart_2x1( FLA_Obj A_side, dim_t b_m );
FLA_Error     FLA_Check_attempted_repart_1x2( FLA_Obj A_side, dim_t b_n );
FLA_Error     FLA_Check_valid_leftright_side( FLA_Side side );
FLA_Error     FLA_Check_valid_topbottom_side( FLA_Side side );
FLA_Error     FLA_Check_matrix_strides( dim_t m, dim_t n, dim_t rs, dim_t cs );
FLA_Error     FLA_Check_vector_dim( FLA_Obj x, dim_t expected_length );
FLA_Error     FLA_Check_row_vector( FLA_Obj x );
FLA_Error     FLA_Check_col_vector( FLA_Obj x );
FLA_Error     FLA_Check_valid_machval( FLA_Machval val );
// FLA_Check_valid_evd_type and FLA_Check_valid_svd_type are not repeated
// here: identical prototypes already appear earlier in this list, next to
// FLA_Check_valid_svd_type_combination.
FLA_Error     FLA_Check_valid_diag_offset( FLA_Obj A, FLA_Diag_off offset );
FLA_Error     FLA_Check_col_storage( FLA_Obj A );
FLA_Error     FLA_Check_row_storage( FLA_Obj A );




// -----------------------------------------------------------------------------

// Error reporting helpers.
// FLA_Error_string_for_code returns a char* for an integer error code;
// whether the caller may modify or free the returned string is not visible
// here.
// FLA_Print_message takes a message plus a file name and line number
// (presumably the call site's __FILE__ / __LINE__ -- confirm).
// NOTE(review): the string parameters are non-const char*; confirm in the
// definitions whether they are ever written through.
char*         FLA_Error_string_for_code( int code );
void          FLA_Error_messages_init( void );
void          FLA_Print_message( char *str, char *file, int line );
void          FLA_Abort( void );



// -----------------------------------------------------------------------------

// Library init/finalize entry points. FLA_Initialized reports a FLA_Bool
// state.
void          FLA_Init( void );
void          FLA_Finalize( void );
FLA_Bool      FLA_Initialized( void );

// "Safe" variants: FLA_Init_safe writes an FLA_Error through init_result and
// FLA_Finalize_safe takes an FLA_Error by value. Presumably the value
// produced by the former is meant to be handed to the latter -- confirm in
// the definitions.
void          FLA_Init_safe( FLA_Error* init_result );
void          FLA_Finalize_safe( FLA_Error init_result );

// Init/finalize pairs for the library's constants; no arguments, no return
// value.
void          FLA_Init_constants( void );
void          FLA_Finalize_constants( void );

void          FLA_Init_numerical_constants( void );
void          FLA_Finalize_numerical_constants( void );



//------------------------------------------------------------------------------

// Lock interface. FLA_Lock has init/destroy/acquire/release; FLA_RWLock has
// init/destroy, separate write and read acquire routines, and a single
// release routine. All take a pointer to the lock object and return
// nothing, so failures cannot be reported through the return value.
void          FLA_Lock_init( FLA_Lock* fla_lock_ptr );
void          FLA_Lock_destroy( FLA_Lock* fla_lock_ptr );
void          FLA_Lock_acquire( FLA_Lock* fla_lock_ptr );
void          FLA_Lock_release( FLA_Lock* fla_lock_ptr );
void          FLA_RWLock_init( FLA_RWLock* fla_lock_ptr );
void          FLA_RWLock_destroy( FLA_RWLock* fla_lock_ptr );
void          FLA_RWLock_write_acquire( FLA_RWLock* fla_lock_ptr );
void          FLA_RWLock_read_acquire( FLA_RWLock* fla_lock_ptr );
void          FLA_RWLock_release( FLA_RWLock* fla_lock_ptr );


// -----------------------------------------------------------------------------

// Memory leak counter control. The status routine returns a FLA_Bool; the
// set routine takes the new status and also returns a FLA_Bool (possibly
// the previous status -- confirm in the definition).
void          FLA_Memory_leak_counter_init( void );
void          FLA_Memory_leak_counter_finalize( void );
FLA_Bool      FLA_Memory_leak_counter_status( void );
FLA_Bool      FLA_Memory_leak_counter_set( FLA_Bool new_status );

// Allocation routines whose signatures mirror malloc / realloc / free.
// The "buff" variants form a separate malloc/free pair; presumably memory
// from FLA_buff_malloc must be released with FLA_buff_free and memory from
// FLA_malloc / FLA_realloc with FLA_free -- confirm in the definitions.
void*         FLA_malloc( size_t size );
void*         FLA_realloc( void* old_ptr, size_t size );
void*         FLA_buff_malloc( size_t size );
void          FLA_free( void *ptr );
void          FLA_buff_free( void *ptr );
 


// -----------------------------------------------------------------------------

// Object view, scalar-extraction and printing routines.
//   FLA_Obj_copy_view               A is passed by value, B is written through
//                                   a pointer.
//   FLA_Obj_extract_*_scalar        write the value of object alpha into a
//                                   caller-supplied double / dcomplex.
//   FLA_Obj_{extract,set}_*_part    take both alpha and beta by value; which
//                                   one is the destination is not visible
//                                   from the signature -- TODO confirm.
//   FLA_Obj_show / FLA_Obj_fshow    take two strings s1 and s2, the object and
//                                   a format string; fshow additionally takes
//                                   the FILE* to use. Presumably s1 and s2 are
//                                   printed before and after the object and
//                                   format is a printf-style element format
//                                   (assumption).
FLA_Error     FLA_Obj_copy_view( FLA_Obj A, FLA_Obj* B );
void          FLA_Obj_extract_real_scalar( FLA_Obj alpha, double* alpha_value );
void          FLA_Obj_extract_complex_scalar( FLA_Obj alpha, dcomplex* alpha_value );
void          FLA_Obj_extract_real_part( FLA_Obj alpha, FLA_Obj beta );
void          FLA_Obj_extract_imag_part( FLA_Obj alpha, FLA_Obj beta );
void          FLA_Obj_set_real_part( FLA_Obj alpha, FLA_Obj beta );
void          FLA_Obj_set_imag_part( FLA_Obj alpha, FLA_Obj beta );
FLA_Error     FLA_Obj_show( char *s1, FLA_Obj A, char *format, char *s2 );
FLA_Error     FLA_Obj_fshow( FILE* file, char *s1, FLA_Obj A, char *format, char *s2 );

// Argument-checking companions: one `_check` routine per routine above, with
// the same parameter list, each returning an FLA_Error.
FLA_Error     FLA_Obj_copy_view_check( FLA_Obj A, FLA_Obj* B );
FLA_Error     FLA_Obj_extract_real_scalar_check( FLA_Obj alpha, double* alpha_value );
FLA_Error     FLA_Obj_extract_complex_scalar_check( FLA_Obj alpha, dcomplex* alpha_value );
FLA_Error     FLA_Obj_extract_real_part_check( FLA_Obj alpha, FLA_Obj beta );
FLA_Error     FLA_Obj_extract_imag_part_check( FLA_Obj alpha, FLA_Obj beta );
FLA_Error     FLA_Obj_set_real_part_check( FLA_Obj alpha, FLA_Obj beta );
FLA_Error     FLA_Obj_set_imag_part_check( FLA_Obj alpha, FLA_Obj beta );
FLA_Error     FLA_Obj_show_check( char* s1, FLA_Obj obj, char* format, char* s2 );
FLA_Error     FLA_Obj_fshow_check( FILE* file, char* s1, FLA_Obj obj, char* format, char* s2 );


// -----------------------------------------------------------------------------

// Copies between a raw buffer and an object.
//   trans     FLA_Trans selector.
//   m, n      dimensions (presumably of the buffer-side matrix -- TODO confirm).
//   buffer    untyped pointer to the raw data.
//   rs, cs    row and column strides that accompany buffer.
//   i, j      offsets that accompany obj (presumably the row/column position
//             inside obj where the transfer starts -- assumption).
// Note that the two directions order their parameters differently: the source
// operand's arguments come first in each. The `_check` variants take the same
// parameter lists.
FLA_Error     FLA_Copy_buffer_to_object( FLA_Trans trans, dim_t m, dim_t n, void* buffer, dim_t rs, dim_t cs, dim_t i, dim_t j, FLA_Obj obj );
FLA_Error     FLA_Copy_object_to_buffer( FLA_Trans trans, dim_t i, dim_t j, FLA_Obj obj, dim_t m, dim_t n, void* buffer, dim_t rs, dim_t cs );
FLA_Error     FLA_Copy_buffer_to_object_check( FLA_Trans trans, dim_t m, dim_t n, void* buffer, dim_t rs, dim_t cs, dim_t i, dim_t j, FLA_Obj obj );
FLA_Error     FLA_Copy_object_to_buffer_check( FLA_Trans trans, dim_t i, dim_t j, FLA_Obj obj, dim_t m, dim_t n, void* buffer, dim_t rs, dim_t cs );



// -----------------------------------------------------------------------------

// Axpy-style transfers between a raw buffer and an object. The parameter
// lists mirror the FLA_Copy_{buffer_to_object,object_to_buffer} prototypes
// with an additional scalar object `alpha` after `trans`; presumably the
// source is scaled by alpha and accumulated into the destination (inferred
// from the "Axpy" name -- TODO confirm).
FLA_Error     FLA_Axpy_buffer_to_object( FLA_Trans trans, FLA_Obj alpha, dim_t m, dim_t n, void* buffer, dim_t rs, dim_t cs, dim_t i, dim_t j, FLA_Obj C );
FLA_Error     FLA_Axpy_object_to_buffer( FLA_Trans trans, FLA_Obj alpha, dim_t i, dim_t j, FLA_Obj C, dim_t m, dim_t n, void* buffer, dim_t rs, dim_t cs );

// Argument-checking companions with identical parameter lists.
FLA_Error     FLA_Axpy_buffer_to_object_check( FLA_Trans trans, FLA_Obj alpha, dim_t m, dim_t n, void* buffer, dim_t rs, dim_t cs, dim_t i, dim_t j, FLA_Obj C );
FLA_Error     FLA_Axpy_object_to_buffer_check( FLA_Trans trans, FLA_Obj alpha, dim_t i, dim_t j, FLA_Obj C, dim_t m, dim_t n, void* buffer, dim_t rs, dim_t cs );



// -----------------------------------------------------------------------------

// Object lifetime management: creation, buffer attachment and release.

// These three are declared only when FLA_ENABLE_SCC is defined.
#ifdef FLA_ENABLE_SCC
void*         FLA_shmalloc( size_t size );
void          FLA_shfree( void* ptr );
FLA_Bool      FLA_is_owner( void );
#endif
// Creation routines write the new object through the trailing `FLA_Obj *obj`
// parameter and report status via the FLA_Error return value.
//   datatype / elemtype   element type selectors.
//   m, n                  object dimensions.
//   m_inner, n_inner      additional dimensions taken only by the _ext form
//                         (presumably for hierarchical objects -- assumption).
//   rs, cs                row and column strides.
// NOTE(review): the names suggest three distinct release paths (object plus
// buffer, object only, buffer only); which one pairs with which creation
// routine should be confirmed against the definitions.
FLA_Error     FLA_Obj_nullify( FLA_Obj *obj );
FLA_Error     FLA_Obj_create( FLA_Datatype datatype, dim_t m, dim_t n, dim_t rs, dim_t cs, FLA_Obj *obj );
FLA_Error     FLA_Obj_create_ext( FLA_Datatype datatype, FLA_Elemtype elemtype, dim_t m, dim_t n, dim_t m_inner, dim_t n_inner, dim_t rs, dim_t cs, FLA_Obj *obj );
FLA_Error     FLA_Obj_create_conf_to( FLA_Trans trans, FLA_Obj old, FLA_Obj *obj );
FLA_Error     FLA_Obj_create_copy_of( FLA_Trans trans, FLA_Obj old, FLA_Obj *obj );
FLA_Error     FLA_Obj_create_without_buffer( FLA_Datatype datatype, dim_t m, dim_t n, FLA_Obj *obj );
FLA_Error     FLA_Obj_create_constant( double const_real, FLA_Obj *obj );
FLA_Error     FLA_Obj_create_constant_ext( float const_s, double const_d, FLA_Obj *obj );
FLA_Error     FLA_Obj_create_complex_constant( double const_real, double const_imag, FLA_Obj *obj );
FLA_Error     FLA_Obj_attach_buffer( void *buffer, dim_t rs, dim_t cs, FLA_Obj *obj );
FLA_Error     FLA_Obj_create_buffer( dim_t rs, dim_t cs, FLA_Obj *obj );
FLA_Error     FLA_Obj_free( FLA_Obj *obj );
FLA_Error     FLA_Obj_free_without_buffer( FLA_Obj *obj );
FLA_Error     FLA_Obj_free_buffer( FLA_Obj *obj );
// Stride / leading-dimension helpers. FLA_compute_num_elem and
// FLA_adjust_strides take rs and cs by pointer, so they may modify the
// caller's stride values.
dim_t         FLA_align_ldim( dim_t ldim, dim_t elem_size );
dim_t         FLA_compute_num_elem( dim_t elem_size, dim_t m, dim_t n, dim_t* rs, dim_t* cs );
void          FLA_adjust_strides( dim_t m, dim_t n, dim_t* rs, dim_t* cs );

// In-place "flip" of an object's base or view (operates through the pointer).
FLA_Error     FLA_Obj_flip_base( FLA_Obj *obj );
FLA_Error     FLA_Obj_flip_view( FLA_Obj *obj );

// Argument-checking companions. No `_check` prototype is declared here for
// FLA_Obj_nullify, FLA_Obj_create, FLA_Obj_create_copy_of or the flip
// routines.
FLA_Error     FLA_Obj_create_ext_check( FLA_Datatype datatype, FLA_Elemtype elemtype, dim_t m, dim_t n, dim_t m_inner, dim_t n_inner, dim_t rs, dim_t cs, FLA_Obj *obj );
FLA_Error     FLA_Obj_create_conf_to_check( FLA_Trans trans, FLA_Obj obj_old, FLA_Obj *obj );
FLA_Error     FLA_Obj_create_without_buffer_check( FLA_Datatype datatype, dim_t m, dim_t n, FLA_Obj *obj );
FLA_Error     FLA_Obj_create_constant_check( double const_real, FLA_Obj *obj );
FLA_Error     FLA_Obj_create_constant_ext_check( float const_s, double const_d, FLA_Obj *obj );
FLA_Error     FLA_Obj_create_complex_constant_check( double const_real, double const_imag, FLA_Obj *obj );
FLA_Error     FLA_Obj_attach_buffer_check( void *buffer, dim_t rs, dim_t cs, FLA_Obj *obj );
FLA_Error     FLA_Obj_create_buffer_check( dim_t rs, dim_t cs, FLA_Obj *obj );
FLA_Error     FLA_Obj_free_check( FLA_Obj *obj );
FLA_Error     FLA_Obj_free_without_buffer_check( FLA_Obj *obj );
FLA_Error     FLA_Obj_free_buffer_check( FLA_Obj *obj );

// Task-style variants: they take the object by value plus an opaque `cntl`
// pointer (presumably for a runtime task scheduler -- assumption).
FLA_Error     FLA_Obj_create_buffer_task( dim_t rs, dim_t cs, FLA_Obj obj, void* cntl );
FLA_Error     FLA_Obj_free_buffer_task( FLA_Obj obj, void* cntl );


// -----------------------------------------------------------------------------

// Object query interface. Every routine takes its object(s) by value.

// Type and size queries.
FLA_Datatype  FLA_Obj_datatype( FLA_Obj obj );
FLA_Datatype  FLA_Obj_datatype_proj_to_real( FLA_Obj A );
FLA_Datatype  FLA_Obj_datatype_proj_to_complex( FLA_Obj A );
FLA_Elemtype  FLA_Obj_elemtype( FLA_Obj obj );
dim_t         FLA_Obj_datatype_size( FLA_Datatype datatype );
dim_t         FLA_Obj_elem_size( FLA_Obj obj );
// Dimension, stride and offset queries; all return dim_t except
// FLA_Obj_structure, which returns an FLA_Uplo.
dim_t         FLA_Obj_length( FLA_Obj obj );
dim_t         FLA_Obj_width( FLA_Obj obj );
FLA_Uplo      FLA_Obj_structure( FLA_Obj obj );
dim_t         FLA_Obj_vector_dim( FLA_Obj obj );
dim_t         FLA_Obj_vector_inc( FLA_Obj obj );
dim_t         FLA_Obj_min_dim( FLA_Obj obj );
dim_t         FLA_Obj_max_dim( FLA_Obj obj );
dim_t         FLA_Obj_row_stride( FLA_Obj obj );
dim_t         FLA_Obj_col_stride( FLA_Obj obj );
dim_t         FLA_Obj_row_offset( FLA_Obj obj );
dim_t         FLA_Obj_col_offset( FLA_Obj obj );
dim_t         FLA_Obj_base_length( FLA_Obj obj );
dim_t         FLA_Obj_base_width( FLA_Obj obj );
dim_t         FLA_Obj_num_elem_alloc( FLA_Obj obj );
// Buffer access; the results are untyped pointers that the caller must cast.
void*         FLA_Obj_base_buffer( FLA_Obj obj );
void*         FLA_Obj_buffer_at_view( FLA_Obj obj );
// Declared only when FLA_ENABLE_HIP is defined; takes the device-side buffer
// pointer in addition to the object.
#ifdef FLA_ENABLE_HIP
void*         FLA_Obj_hip_buffer_at_view( FLA_Obj obj, void* hip_buffer );
#endif
// Boolean predicates on one object.
FLA_Bool      FLA_Obj_buffer_is_null( FLA_Obj obj );
FLA_Bool      FLA_Obj_is_int( FLA_Obj A );
FLA_Bool      FLA_Obj_is_floating_point( FLA_Obj A );
FLA_Bool      FLA_Obj_is_constant( FLA_Obj A );
FLA_Bool      FLA_Obj_is_real( FLA_Obj A );
FLA_Bool      FLA_Obj_is_complex( FLA_Obj A );
FLA_Bool      FLA_Obj_is_single_precision( FLA_Obj A );
FLA_Bool      FLA_Obj_is_double_precision( FLA_Obj A );
FLA_Bool      FLA_Obj_is_scalar( FLA_Obj A );
FLA_Bool      FLA_Obj_is_vector( FLA_Obj A );
FLA_Bool      FLA_Obj_has_zero_dim( FLA_Obj A );
FLA_Bool      FLA_Obj_is_row_major( FLA_Obj A );
FLA_Bool      FLA_Obj_is_col_major( FLA_Obj A );
// Boolean predicates and comparisons on a pair of objects.
// NOTE(review): FLA_Obj_is, FLA_Obj_is_identical and FLA_Obj_equals are three
// separate entry points; how their notions of equality differ is not visible
// from the signatures -- consult the definitions before choosing one.
FLA_Bool      FLA_Obj_is_conformal_to( FLA_Trans trans, FLA_Obj A, FLA_Obj B );
FLA_Bool      FLA_Obj_is( FLA_Obj A, FLA_Obj B );
FLA_Bool      FLA_Obj_is_identical( FLA_Obj A, FLA_Obj B );
FLA_Bool      FLA_Obj_is_overlapped( FLA_Obj A, FLA_Obj B );
FLA_Bool      FLA_Obj_equals( FLA_Obj A, FLA_Obj B );
FLA_Bool      FLA_Obj_gt( FLA_Obj A, FLA_Obj B );
FLA_Bool      FLA_Obj_ge( FLA_Obj A, FLA_Obj B );
FLA_Bool      FLA_Obj_lt( FLA_Obj A, FLA_Obj B );
FLA_Bool      FLA_Obj_le( FLA_Obj A, FLA_Obj B );
// Raw-buffer helper: takes a datatype, a base pointer, offsets (i, j) and
// strides (rs, cs) and returns an untyped pointer (presumably the address of
// element (i, j) -- TODO confirm).
void*         FLA_Submatrix_at( FLA_Datatype datatype, void* buffer, dim_t i, dim_t j, dim_t rs, dim_t cs );
FLA_Bool      FLA_Obj_has_nan( FLA_Obj A );

// Argument-checking companions for a subset of the queries above.
FLA_Error     FLA_Obj_datatype_check( FLA_Obj obj );
FLA_Error     FLA_Obj_datatype_proj_to_real_check( FLA_Obj obj );
FLA_Error     FLA_Obj_elemtype_check( FLA_Obj obj );
FLA_Error     FLA_Obj_datatype_size_check( FLA_Datatype datatype );
FLA_Error     FLA_Obj_elem_size_check( FLA_Obj obj );
FLA_Error     FLA_Obj_buffer_at_view_check( FLA_Obj obj );
FLA_Error     FLA_Obj_equals_check( FLA_Obj A, FLA_Obj B );
// NOTE(review): the four comparison checks below are declared as returning
// FLA_Bool, whereas every other `_check` prototype in this group returns
// FLA_Error. Looks like a copy/paste slip from the non-check prototypes --
// confirm against the definitions before changing.
FLA_Bool      FLA_Obj_gt_check( FLA_Obj A, FLA_Obj B );
FLA_Bool      FLA_Obj_ge_check( FLA_Obj A, FLA_Obj B );
FLA_Bool      FLA_Obj_lt_check( FLA_Obj A, FLA_Obj B );
FLA_Bool      FLA_Obj_le_check( FLA_Obj A, FLA_Obj B );
FLA_Error     FLA_Submatrix_at_check( FLA_Datatype datatype, void* buffer, dim_t i, dim_t j, dim_t rs, dim_t cs );
FLA_Error     FLA_Obj_has_nan_check( FLA_Obj A );


// -----------------------------------------------------------------------------

// FLAME -> netlib (BLAS/LAPACK) parameter mapping. Each routine takes a FLAME
// parameter value and writes the mapped value through an untyped pointer; the
// concrete type expected behind that void* is not visible here (presumably a
// single character, as in the Fortran interfaces -- TODO confirm).
// NOTE(review): the `side` mapper declares its input as FLA_Uplo rather than
// FLA_Side, unlike the rocblas and *_to_flame_side prototypes in this file.
// Likely a typo; harmless only if both are typedefs of the same type.
void          FLA_Param_map_flame_to_netlib_trans( FLA_Trans trans, void* blas_trans );
void          FLA_Param_map_flame_to_netlib_uplo( FLA_Uplo uplo, void* blas_uplo );
void          FLA_Param_map_flame_to_netlib_side( FLA_Uplo side, void* blas_side );
void          FLA_Param_map_flame_to_netlib_diag( FLA_Diag diag, void* blas_diag );
void          FLA_Param_map_flame_to_netlib_direct( FLA_Direct direct, void* lapack_direct );
void          FLA_Param_map_flame_to_netlib_storev( FLA_Store storev, void* lapack_storev );
void          FLA_Param_map_flame_to_netlib_evd_type( FLA_Evd_type evd_type, void* lapack_evd_type );
void          FLA_Param_map_flame_to_netlib_svd_type( FLA_Svd_type svd_type, void* lapack_svd_type );
void          FLA_Param_map_flame_to_netlib_machval( FLA_Machval machval, void* blas_machval );

// FLAME -> rocBLAS parameter mapping, declared only when FLA_ENABLE_HIP is
// defined. Unlike the netlib and BLIS mappers, these return the mapped enum
// value directly instead of writing through an output pointer. The `trans`
// mapper additionally takes an `is_real` flag (presumably to choose between
// transpose and conjugate-transpose for real data -- assumption).
#ifdef FLA_ENABLE_HIP
rocblas_operation FLA_Param_map_flame_to_rocblas_trans( FLA_Trans trans, FLA_Bool is_real );
rocblas_fill      FLA_Param_map_flame_to_rocblas_uplo( FLA_Uplo uplo );
rocblas_side      FLA_Param_map_flame_to_rocblas_side( FLA_Side side );
rocblas_diagonal  FLA_Param_map_flame_to_rocblas_diag( FLA_Diag diag );
rocblas_evect     FLA_Param_map_flame_to_rocblas_evd_type( FLA_Evd_type evd_type );
rocblas_svect     FLA_Param_map_flame_to_rocblas_svd_type( FLA_Svd_type svd_type );
#endif

// FLAME -> BLIS parameter mapping. Each routine writes the mapped value
// through a typed pointer (trans1_t, conj1_t, uplo1_t, side1_t, diag1_t).
// NOTE(review): as with the netlib mapper, the `side` routine takes its input
// as FLA_Uplo rather than FLA_Side -- likely a typo, confirm before changing.
void          FLA_Param_map_flame_to_blis_trans( FLA_Trans trans, trans1_t* blis_trans );
void          FLA_Param_map_flame_to_blis_conj( FLA_Conj conj, conj1_t* blis_conj );
void          FLA_Param_map_flame_to_blis_uplo( FLA_Uplo uplo, uplo1_t* blis_uplo );
void          FLA_Param_map_flame_to_blis_side( FLA_Uplo side, side1_t* blis_side );
void          FLA_Param_map_flame_to_blis_diag( FLA_Diag diag, diag1_t* blis_diag );
// The "blis2" mappers below are compiled out unconditionally by `#if 0`; they
// are not part of the interface.
#if 0
void          FLA_Param_map_flame_to_blis2_trans( FLA_Trans trans, trans_t* blis_trans );
void          FLA_Param_map_flame_to_blis2_conj( FLA_Conj conj, conj_t* blis_conj );
void          FLA_Param_map_flame_to_blis2_uplo( FLA_Uplo uplo, uplo_t* blis_uplo );
void          FLA_Param_map_flame_to_blis2_side( FLA_Uplo side, side_t* blis_side );
void          FLA_Param_map_flame_to_blis2_diag( FLA_Diag diag, diag_t* blis_diag );
#endif

// BLIS -> FLAME parameter mapping: each routine takes a BLIS enum by value and
// writes the FLAME equivalent through the output pointer. No conj mapper is
// declared in this direction.
void          FLA_Param_map_blis_to_flame_trans( trans1_t trans, FLA_Trans* flame_trans );
void          FLA_Param_map_blis_to_flame_uplo( uplo1_t uplo, FLA_Uplo* flame_uplo );
void          FLA_Param_map_blis_to_flame_side( side1_t side, FLA_Side* flame_side );
void          FLA_Param_map_blis_to_flame_diag( diag1_t diag, FLA_Diag* flame_diag );

// char -> FLAME parameter mapping: the input is passed as char* (presumably a
// pointer to a single option character rather than a NUL-terminated string --
// TODO confirm) and the FLAME value is written through the output pointer.
// NOTE(review): the `storev` mapper writes an FLA_Direct*, whereas
// FLA_Param_map_flame_to_netlib_storev takes an FLA_Store. Looks like a
// copy/paste slip from the `direct` mapper -- confirm against the definition.
void          FLA_Param_map_char_to_flame_trans( char* trans, FLA_Trans* flame_trans );
void          FLA_Param_map_char_to_flame_uplo( char* uplo, FLA_Uplo* flame_uplo );
void          FLA_Param_map_char_to_flame_side( char* side, FLA_Side* flame_side );
void          FLA_Param_map_char_to_flame_diag( char* diag, FLA_Diag* flame_diag );
void          FLA_Param_map_char_to_flame_storev( char* storev, FLA_Direct* flame_storev );
void          FLA_Param_map_char_to_flame_direct( char* direct, FLA_Direct* flame_direct );
void          FLA_Param_map_char_to_flame_inv( char* inv, FLA_Inv* flame_inv );



// netlib -> FLAME parameter mapping. Inputs are passed by pointer, matching
// Fortran-style by-reference arguments (assumption); outputs are written
// through the second pointer. The `inv` mapper is the only one whose input is
// an int* (`itype`) rather than a char*.
void          FLA_Param_map_netlib_to_flame_trans( char* trans, FLA_Trans* flame_trans );
void          FLA_Param_map_netlib_to_flame_uplo( char* uplo, FLA_Uplo* flame_uplo );
void          FLA_Param_map_netlib_to_flame_side( char* side, FLA_Side* flame_side );
void          FLA_Param_map_netlib_to_flame_diag( char* diag, FLA_Diag* flame_diag );
void          FLA_Param_map_netlib_to_flame_inv( int* itype, FLA_Inv* flame_inv );
void          FLA_Param_map_netlib_to_flame_svd_type( char* svd, FLA_Svd_type* flame_svd );




// -----------------------------------------------------------------------------

// Partitioning interface. Throughout this group, inputs are FLA_Obj values
// and outputs are FLA_Obj pointers, and the argument layout in the source
// mirrors the 2-D arrangement of the sub-blocks.
//
// FLA_Part_*: split A into a 2x2, 2x1 or 1x2 arrangement of views written to
// the pointer arguments. mb / nb are block dimensions and quadrant / side
// select a sub-block (presumably the one that receives the mb x nb size --
// TODO confirm).
FLA_Error     FLA_Part_2x2( FLA_Obj A,  FLA_Obj *A11, FLA_Obj *A12,
                                        FLA_Obj *A21, FLA_Obj *A22,
                            dim_t  mb,  dim_t     nb, FLA_Quadrant quadrant );

FLA_Error     FLA_Part_2x1 ( FLA_Obj A,  FLA_Obj *A1,
                                         FLA_Obj *A2,
                             dim_t  mb,  FLA_Side side );

FLA_Error     FLA_Part_1x2( FLA_Obj A,  FLA_Obj *A1, FLA_Obj *A2,
                                        dim_t    nb, FLA_Side side );
 
// FLA_Merge_*: the reverse direction -- sub-blocks are inputs and the single
// object A is the output.
FLA_Error     FLA_Merge_2x2( FLA_Obj A11, FLA_Obj A12,
                             FLA_Obj A21, FLA_Obj A22,  FLA_Obj *A );
 
FLA_Error     FLA_Merge_2x1( FLA_Obj AT,
                             FLA_Obj AB,  FLA_Obj *A );

FLA_Error     FLA_Merge_1x2( FLA_Obj AL, FLA_Obj AR,  FLA_Obj *A );

// FLA_Repart_*: take the current coarse partitioning by value and write a
// finer one (3x3, 3x1 or 1x3) through the pointer arguments. mb / nb give the
// block size and quadrant / side select a sub-block (presumably the one the
// new block is taken from -- assumption).
FLA_Error     FLA_Repart_2x2_to_3x3( FLA_Obj ATL, FLA_Obj ATR,  FLA_Obj *A00, FLA_Obj *A01, FLA_Obj *A02,
                                                                FLA_Obj *A10, FLA_Obj *A11, FLA_Obj *A12,
                                     FLA_Obj ABL, FLA_Obj ABR,  FLA_Obj *A20, FLA_Obj *A21, FLA_Obj *A22,
                                     dim_t   mb,  dim_t    nb,  FLA_Quadrant quadrant );

FLA_Error     FLA_Repart_2x1_to_3x1( FLA_Obj AT,  FLA_Obj *A0,
                                                  FLA_Obj *A1,
                                     FLA_Obj AB,  FLA_Obj *A2,
                                     dim_t   mb,  FLA_Side side );

FLA_Error     FLA_Repart_1x2_to_1x3( FLA_Obj  AL,              FLA_Obj  AR,
                                     FLA_Obj *A0, FLA_Obj *A1, FLA_Obj *A2,
                                                  dim_t    nb, FLA_Side side );

// FLA_Cont_with_*: the counterpart of Repart -- the fine partitioning is
// passed by value and the coarse partitioning is written through pointers.
// quadrant / side select a sub-block (presumably the one that absorbs the
// middle block -- assumption).
FLA_Error     FLA_Cont_with_3x3_to_2x2( FLA_Obj *ATL, FLA_Obj *ATR,  FLA_Obj A00, FLA_Obj A01, FLA_Obj A02,
                                                                     FLA_Obj A10, FLA_Obj A11, FLA_Obj A12,
                                        FLA_Obj *ABL, FLA_Obj *ABR,  FLA_Obj A20, FLA_Obj A21, FLA_Obj A22,
                                                                     FLA_Quadrant quadrant );

FLA_Error     FLA_Cont_with_3x1_to_2x1( FLA_Obj *AT,  FLA_Obj A0,
                                                      FLA_Obj A1,
                                        FLA_Obj *AB,  FLA_Obj A2,
                                                      FLA_Side side );

FLA_Error     FLA_Cont_with_1x3_to_1x2( FLA_Obj *AL,              FLA_Obj *AR,
                                        FLA_Obj  A0, FLA_Obj  A1, FLA_Obj  A2,
                                                                  FLA_Side side );

// 3x3 <-> 5x5 variants. These take a single block size `b` instead of
// separate mb / nb values. No `_check` prototypes are declared for them in
// this header section.
FLA_Error     FLA_Repart_3x3_to_5x5( FLA_Obj ATL, FLA_Obj ATM, FLA_Obj ATR,
                                     FLA_Obj AML, FLA_Obj AMM, FLA_Obj AMR,
                                     FLA_Obj ABL, FLA_Obj ABM, FLA_Obj ABR,
                                     FLA_Obj *A00, FLA_Obj *A01, FLA_Obj *A02, FLA_Obj *A03, FLA_Obj *A04,
                                     FLA_Obj *A10, FLA_Obj *A11, FLA_Obj *A12, FLA_Obj *A13, FLA_Obj *A14,
                                     FLA_Obj *A20, FLA_Obj *A21, FLA_Obj *A22, FLA_Obj *A23, FLA_Obj *A24,
                                     FLA_Obj *A30, FLA_Obj *A31, FLA_Obj *A32, FLA_Obj *A33, FLA_Obj *A34,
                                     FLA_Obj *A40, FLA_Obj *A41, FLA_Obj *A42, FLA_Obj *A43, FLA_Obj *A44,
                                     dim_t b, FLA_Quadrant quadrant );

FLA_Error     FLA_Cont_with_5x5_to_3x3( FLA_Obj *ATL, FLA_Obj *ATM, FLA_Obj *ATR,
                                        FLA_Obj *AML, FLA_Obj *AMM, FLA_Obj *AMR,
                                        FLA_Obj *ABL, FLA_Obj *ABM, FLA_Obj *ABR,
                                        FLA_Obj A00, FLA_Obj A01, FLA_Obj A02, FLA_Obj A03, FLA_Obj A04,
                                        FLA_Obj A10, FLA_Obj A11, FLA_Obj A12, FLA_Obj A13, FLA_Obj A14,
                                        FLA_Obj A20, FLA_Obj A21, FLA_Obj A22, FLA_Obj A23, FLA_Obj A24,
                                        FLA_Obj A30, FLA_Obj A31, FLA_Obj A32, FLA_Obj A33, FLA_Obj A34,
                                        FLA_Obj A40, FLA_Obj A41, FLA_Obj A42, FLA_Obj A43, FLA_Obj A44,
                                        FLA_Quadrant quadrant );



// Argument-checking companions for the Part / Merge / Repart / Cont_with
// routines. Each `_check` prototype takes exactly the same parameter list as
// the routine it is named after and returns an FLA_Error. The 3x3 <-> 5x5
// routines have no companion here.
FLA_Error     FLA_Part_2x2_check( FLA_Obj A,  FLA_Obj *A11, FLA_Obj *A12,
                                              FLA_Obj *A21, FLA_Obj *A22,
                                  dim_t  mb,  dim_t     nb, FLA_Quadrant quadrant );

FLA_Error     FLA_Part_2x1_check( FLA_Obj A,  FLA_Obj *A1,
                                               FLA_Obj *A2,
                                   dim_t  mb,  FLA_Side side );

FLA_Error     FLA_Part_1x2_check( FLA_Obj A,  FLA_Obj *A1, FLA_Obj *A2,
                                              dim_t    nb, FLA_Side side );
 
FLA_Error     FLA_Merge_2x2_check( FLA_Obj A11, FLA_Obj A12,
                                   FLA_Obj A21, FLA_Obj A22,  FLA_Obj *A );
 
FLA_Error     FLA_Merge_2x1_check( FLA_Obj AT,
                                   FLA_Obj AB,  FLA_Obj *A );

FLA_Error     FLA_Merge_1x2_check( FLA_Obj AL, FLA_Obj AR,  FLA_Obj *A );

FLA_Error     FLA_Repart_2x2_to_3x3_check( FLA_Obj ATL, FLA_Obj ATR,  FLA_Obj *A00, FLA_Obj *A01, FLA_Obj *A02,
                                                                      FLA_Obj *A10, FLA_Obj *A11, FLA_Obj *A12,
                                           FLA_Obj ABL, FLA_Obj ABR,  FLA_Obj *A20, FLA_Obj *A21, FLA_Obj *A22,
                                           dim_t   mb,  dim_t    nb,  FLA_Quadrant quadrant );

FLA_Error     FLA_Repart_2x1_to_3x1_check( FLA_Obj AT,  FLA_Obj *A0,
                                                        FLA_Obj *A1,
                                           FLA_Obj AB,  FLA_Obj *A2,
                                           dim_t   mb,  FLA_Side side );

FLA_Error     FLA_Repart_1x2_to_1x3_check( FLA_Obj  AL,              FLA_Obj  AR,
                                           FLA_Obj *A0, FLA_Obj *A1, FLA_Obj *A2,
                                                        dim_t    nb, FLA_Side side );

FLA_Error     FLA_Cont_with_3x3_to_2x2_check( FLA_Obj *ATL, FLA_Obj *ATR,  FLA_Obj A00, FLA_Obj A01, FLA_Obj A02,
                                                                           FLA_Obj A10, FLA_Obj A11, FLA_Obj A12,
                                              FLA_Obj *ABL, FLA_Obj *ABR,  FLA_Obj A20, FLA_Obj A21, FLA_Obj A22,
                                                                           FLA_Quadrant quadrant );

FLA_Error     FLA_Cont_with_3x1_to_2x1_check( FLA_Obj *AT,  FLA_Obj A0,
                                                            FLA_Obj A1,
                                              FLA_Obj *AB,  FLA_Obj A2,
                                                            FLA_Side side );

FLA_Error     FLA_Cont_with_1x3_to_1x2_check( FLA_Obj *AL,              FLA_Obj *AR,
                                              FLA_Obj  A0, FLA_Obj  A1, FLA_Obj  A2,
                                                                        FLA_Side side );
// end FLA_main_prototypes.h
// begin FLA_util_base_prototypes.h


// Scalar random-number generators, one per floating-point element type
// (float, double, scomplex, dcomplex). The value range and the underlying
// generator/seed are not visible from this header -- TODO confirm before
// relying on a particular distribution.
float     FLA_random_float( void );
double    FLA_random_double( void );
scomplex  FLA_random_scomplex( void );
dcomplex  FLA_random_dcomplex( void );

// General object utilities. All take their operands as FLA_Obj values and
// return an FLA_Error, except FLA_Clock and FLA_Max_elemwise_diff, which
// return a double. Routines with a single object argument presumably update
// it in place (inferred from the absence of a separate output parameter --
// TODO confirm).
FLA_Error FLA_Absolute_square( FLA_Obj alpha );
FLA_Error FLA_Absolute_value( FLA_Obj alpha );
// Timer reading as a double (presumably seconds -- assumption; units are not
// visible here).
double    FLA_Clock( void );
FLA_Error FLA_Conjugate( FLA_Obj A );
FLA_Error FLA_Conjugate_r( FLA_Uplo uplo, FLA_Obj A );
// Vector fill routines: x is the object being filled; the leading objects
// parameterize the distribution.
FLA_Error FLA_Fill_with_linear_dist( FLA_Obj shift, FLA_Obj delta, FLA_Obj x );
FLA_Error FLA_Fill_with_inverse_dist( FLA_Obj alpha, FLA_Obj x );
FLA_Error FLA_Fill_with_geometric_dist( FLA_Obj alpha, FLA_Obj x );
FLA_Error FLA_Fill_with_random_dist( FLA_Obj shift, FLA_Obj max, FLA_Obj x );
FLA_Error FLA_Fill_with_logarithmic_dist( FLA_Obj max, FLA_Obj x );
FLA_Error FLA_Fill_with_cluster_dist( FLA_Obj n_clusters, FLA_Obj cluster_width, FLA_Obj x );
FLA_Error FLA_Hermitianize( FLA_Uplo uplo, FLA_Obj A );
FLA_Error FLA_Invert( FLA_Conj conj, FLA_Obj x );
FLA_Error FLA_Inv_scal_elemwise( FLA_Trans trans, FLA_Obj A, FLA_Obj B );
// Reductions: the result is delivered in the trailing object argument.
FLA_Error FLA_Max_abs_value( FLA_Obj A, FLA_Obj amax );
FLA_Error FLA_Max_abs_value_herm( FLA_Uplo uplo, FLA_Obj A, FLA_Obj maxabs );
double    FLA_Max_elemwise_diff( FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Mult_add( FLA_Obj alpha, FLA_Obj beta, FLA_Obj gamma );
FLA_Error FLA_Negate( FLA_Obj x );
FLA_Error FLA_Norm1( FLA_Obj A, FLA_Obj norm );
FLA_Error FLA_Norm_inf( FLA_Obj A, FLA_Obj norm );
FLA_Error FLA_Norm_frob( FLA_Obj A, FLA_Obj norm );
FLA_Error FLA_Pow( FLA_Obj base, FLA_Obj exp, FLA_Obj btoe );
// Random matrix generators; uplo / diag select the stored triangle and
// diagonal type for the structured variants.
FLA_Error FLA_Random_matrix( FLA_Obj A );
FLA_Error FLA_Random_herm_matrix( FLA_Uplo uplo, FLA_Obj A );
FLA_Error FLA_Random_symm_matrix( FLA_Uplo uplo, FLA_Obj A );
FLA_Error FLA_Random_spd_matrix( FLA_Uplo uplo, FLA_Obj A );
FLA_Error FLA_Random_tri_matrix( FLA_Uplo uplo, FLA_Diag diag, FLA_Obj A );
FLA_Error FLA_Random_unitary_matrix( FLA_Obj A );
FLA_Error FLA_Scal_elemwise( FLA_Trans trans, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Setr( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A );
// NOTE(review): this is a `_check` prototype listed among the operation
// prototypes, and no FLA_Shift_pivots_to prototype is visible in this section
// of the header. Possibly misplaced -- confirm where the operation itself is
// declared.
FLA_Error FLA_Shift_pivots_to_check( FLA_Pivot_type ptype, FLA_Obj p );
FLA_Error FLA_Sqrt( FLA_Obj alpha );
FLA_Error FLA_Symmetrize( FLA_Uplo uplo, FLA_Obj A );
FLA_Error FLA_Triangularize( FLA_Uplo uplo, FLA_Diag diag, FLA_Obj A );
FLA_Error FLA_Transpose( FLA_Obj A );

// Element-setting and diagonal utilities. A is the object operated on; alpha
// and sigma are scalar objects.
//   FLA_Set_offdiag   takes a signed int `offset` (presumably which off-
//                     diagonal to set; the sign convention is not visible
//                     here -- TODO confirm).
//   FLA_Add_to_diag   takes the value as an untyped pointer, so the pointee
//                     type presumably has to match A's datatype (assumption).
FLA_Error FLA_Set( FLA_Obj alpha, FLA_Obj A );
FLA_Error FLA_Set_diag( FLA_Obj alpha, FLA_Obj A );
FLA_Error FLA_Set_offdiag( int offset, FLA_Obj alpha, FLA_Obj A );
FLA_Error FLA_Set_to_identity( FLA_Obj A );
FLA_Error FLA_Add_to_diag( void *diag_value, FLA_Obj A );
FLA_Error FLA_Shift_diag( FLA_Conj conj, FLA_Obj sigma, FLA_Obj A );
FLA_Error FLA_Scale_diag( FLA_Conj conj, FLA_Obj alpha, FLA_Obj A );

// Transfers between a matrix diagonal and a vector. The argument order
// differs between the two: (A, d) for the first and (d, A) for the second.
// Which operand is the destination is not visible from the signatures --
// TODO confirm against the definitions.
FLA_Error FLA_Set_diagonal_vector( FLA_Obj A, FLA_Obj d );
FLA_Error FLA_Set_diagonal_matrix( FLA_Obj d, FLA_Obj A );

// -----------------------------------------------------------------------------

// Argument-checking companions for the general object utilities. Each takes
// the same parameter types as the routine it is named after and returns an
// FLA_Error (including FLA_Max_elemwise_diff_check, whose operation returns a
// double).
FLA_Error FLA_Absolute_square_check( FLA_Obj alpha );
FLA_Error FLA_Absolute_value_check( FLA_Obj alpha );
FLA_Error FLA_Conjugate_check( FLA_Obj A );
FLA_Error FLA_Conjugate_r_check( FLA_Uplo uplo, FLA_Obj A );
FLA_Error FLA_Fill_with_linear_dist_check( FLA_Obj shift, FLA_Obj delta, FLA_Obj x );
FLA_Error FLA_Fill_with_inverse_dist_check( FLA_Obj alpha, FLA_Obj x );
FLA_Error FLA_Fill_with_geometric_dist_check( FLA_Obj alpha, FLA_Obj x );
FLA_Error FLA_Fill_with_random_dist_check( FLA_Obj shift, FLA_Obj max, FLA_Obj x );
// NOTE(review): the first parameter is named `alpha` here but `max` in the
// FLA_Fill_with_logarithmic_dist prototype; types match, only the name differs.
FLA_Error FLA_Fill_with_logarithmic_dist_check( FLA_Obj alpha, FLA_Obj x );
FLA_Error FLA_Fill_with_cluster_dist_check( FLA_Obj n_clusters, FLA_Obj cluster_width, FLA_Obj x );
FLA_Error FLA_Hermitianize_check( FLA_Uplo uplo, FLA_Obj A );
FLA_Error FLA_Invert_check( FLA_Conj conj, FLA_Obj x );
FLA_Error FLA_Inv_scal_elemwise_check( FLA_Trans trans, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Max_abs_value_check( FLA_Obj A, FLA_Obj amax );
FLA_Error FLA_Max_abs_value_herm_check( FLA_Uplo uplo, FLA_Obj A, FLA_Obj maxabs );
FLA_Error FLA_Max_elemwise_diff_check( FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Mult_add_check( FLA_Obj alpha, FLA_Obj beta, FLA_Obj gamma );
FLA_Error FLA_Negate_check( FLA_Obj x );
FLA_Error FLA_Norm1_check( FLA_Obj A, FLA_Obj norm );
FLA_Error FLA_Norm_inf_check( FLA_Obj A, FLA_Obj norm );
FLA_Error FLA_Norm_frob_check( FLA_Obj A, FLA_Obj norm );
FLA_Error FLA_Pow_check( FLA_Obj base, FLA_Obj exp, FLA_Obj btoe );
FLA_Error FLA_Random_matrix_check( FLA_Obj A );
FLA_Error FLA_Random_herm_matrix_check( FLA_Uplo uplo, FLA_Obj A );
FLA_Error FLA_Random_symm_matrix_check( FLA_Uplo uplo, FLA_Obj A );
FLA_Error FLA_Random_spd_matrix_check( FLA_Uplo uplo, FLA_Obj A );
FLA_Error FLA_Random_tri_matrix_check( FLA_Uplo uplo, FLA_Diag diag, FLA_Obj A );
FLA_Error FLA_Random_unitary_matrix_check( FLA_Obj A );
FLA_Error FLA_Scal_elemwise_check( FLA_Trans trans, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Setr_check( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A );
// The FLA_Sort operation itself is declared further down in this header,
// next to its typed FLA_Sort_* variants.
FLA_Error FLA_Sort_check( FLA_Direct direct, FLA_Obj x );
FLA_Error FLA_Sqrt_check( FLA_Obj alpha );
FLA_Error FLA_Symmetrize_check( FLA_Uplo uplo, FLA_Obj A );
FLA_Error FLA_Triangularize_check( FLA_Uplo uplo, FLA_Diag diag, FLA_Obj A );
FLA_Error FLA_Transpose_check( FLA_Obj A );

// Checks for the Set / diagonal utilities. No `_check` prototype is declared
// here for FLA_Set_offdiag, FLA_Set_diagonal_vector or
// FLA_Set_diagonal_matrix.
FLA_Error FLA_Set_check( FLA_Obj alpha, FLA_Obj A );
FLA_Error FLA_Set_diag_check( FLA_Obj alpha, FLA_Obj A );
FLA_Error FLA_Set_to_identity_check( FLA_Obj A );
FLA_Error FLA_Add_to_diag_check( void *diag_value, FLA_Obj A );
FLA_Error FLA_Shift_diag_check( FLA_Conj conj, FLA_Obj sigma, FLA_Obj A );
FLA_Error FLA_Scale_diag_check( FLA_Conj conj, FLA_Obj alpha, FLA_Obj A );

// -----------------------------------------------------------------------------

// Algorithmic variants behind the transpose and swap operations.
//   *_blk_var1/2   take a control-tree pointer (fla_tpose_t* / fla_swap_t*);
//                  presumably these are the blocked variants -- assumption
//                  from the `blk` infix.
//   *_unb_var1/2   take only the object; presumably the unblocked variants.
// The FLA_Swap_t_* routines take two objects, A and B.
FLA_Error FLA_Transpose_blk_var1( FLA_Obj A, fla_tpose_t* cntl );
FLA_Error FLA_Transpose_blk_var2( FLA_Obj A, fla_tpose_t* cntl );
FLA_Error FLA_Transpose_unb_var1( FLA_Obj A );
FLA_Error FLA_Transpose_unb_var2( FLA_Obj A );
FLA_Error FLA_Swap_t_blk_var1( FLA_Obj A, FLA_Obj B, fla_swap_t* cntl );
FLA_Error FLA_Swap_t_blk_var2( FLA_Obj A, FLA_Obj B, fla_swap_t* cntl );

// Sorting. FLA_Sort is the object-based entry point, taking an FLA_Direct
// selector and the vector object x.
// The typed variants operate on a raw array:
//   m_x     number of elements in x.
//   x       pointer to the data (float for *_ops, double for *_opd).
//   inc_x   increment between consecutive elements.
// Presumably the _f_ / _b_ infix selects forward / backward order to match
// the FLA_Direct argument of FLA_Sort -- TODO confirm.
FLA_Error FLA_Sort( FLA_Direct direct, FLA_Obj x );
FLA_Error FLA_Sort_f_ops( int     m_x,
                          float*  x, int inc_x );
FLA_Error FLA_Sort_b_ops( int     m_x,
                          float*  x, int inc_x );
FLA_Error FLA_Sort_f_opd( int     m_x,
                          double* x, int inc_x );
FLA_Error FLA_Sort_b_opd( int     m_x,
                          double* x, int inc_x );

// end FLA_util_base_prototypes.h
// begin FLA_util_lapack_prototypes.h


// --- LAPACK-related utility prototypes ---------------------------------------

// Househ2_UT: object-based front-end (takes a FLA_Side flag) plus typed
// variants on raw pointers. Suffix -> element type of every pointer argument:
// _ops float, _opd double, _opc scomplex, _opz dcomplex.
// Each typed variant takes m_x2, a pointer to chi_1, the buffer x2 with
// inc_x2, and a pointer to tau.
// NOTE(review): the _l_ / _r_ infix presumably corresponds to the left/right
// values of the front-end's side argument, and m_x2 / inc_x2 presumably give
// the length and stride of x2 -- confirm.
FLA_Error FLA_Househ2_UT( FLA_Side side, FLA_Obj chi_1, FLA_Obj x2, FLA_Obj tau );
FLA_Error FLA_Househ2_UT_l_ops( int m_x2,
                                float* chi_1,
                                float* x2, int inc_x2,
                                float* tau );
FLA_Error FLA_Househ2_UT_l_opd( int m_x2,
                                double* chi_1,
                                double* x2, int inc_x2,
                                double* tau );
FLA_Error FLA_Househ2_UT_l_opc( int m_x2,
                                scomplex* chi_1,
                                scomplex* x2, int inc_x2,
                                scomplex* tau );
FLA_Error FLA_Househ2_UT_l_opz( int m_x2,
                                dcomplex* chi_1,
                                dcomplex* x2, int inc_x2,
                                dcomplex* tau );
FLA_Error FLA_Househ2_UT_r_ops( int m_x2,
                                float* chi_1,
                                float* x2, int inc_x2,
                                float* tau );
FLA_Error FLA_Househ2_UT_r_opd( int m_x2,
                                double* chi_1,
                                double* x2, int inc_x2,
                                double* tau );
FLA_Error FLA_Househ2_UT_r_opc( int m_x2,
                                scomplex* chi_1,
                                scomplex* x2, int inc_x2,
                                scomplex* tau );
FLA_Error FLA_Househ2_UT_r_opz( int m_x2,
                                dcomplex* chi_1,
                                dcomplex* x2, int inc_x2,
                                dcomplex* tau );

// Househ3UD_UT: like the Househ2_UT family but with a second vector operand
// y2 and no side argument. Typed variants take m_x2 and m_y2, a pointer to
// chi_1, the buffers x2 / y2 with their own increments, and a pointer to tau.
// Suffix -> element type: _ops float, _opd double, _opc scomplex,
// _opz dcomplex.
// NOTE(review): m_* / inc_* are presumably lengths and strides -- confirm.
FLA_Error FLA_Househ3UD_UT( FLA_Obj chi_1, FLA_Obj x2, FLA_Obj y2, FLA_Obj tau );
FLA_Error FLA_Househ3UD_UT_ops( int m_x2,
                                int m_y2,
                                float* chi_1,
                                float* x2, int inc_x2,
                                float* y2, int inc_y2,
                                float* tau );
FLA_Error FLA_Househ3UD_UT_opd( int m_x2,
                                int m_y2,
                                double* chi_1,
                                double* x2, int inc_x2,
                                double* y2, int inc_y2,
                                double* tau );
FLA_Error FLA_Househ3UD_UT_opc( int m_x2,
                                int m_y2,
                                scomplex* chi_1,
                                scomplex* x2, int inc_x2,
                                scomplex* y2, int inc_y2,
                                scomplex* tau );
FLA_Error FLA_Househ3UD_UT_opz( int m_x2,
                                int m_y2,
                                dcomplex* chi_1,
                                dcomplex* x2, int inc_x2,
                                dcomplex* y2, int inc_y2,
                                dcomplex* tau );

// Househ2s_UT: same operand layout as the Househ2_UT family, with two extra
// pointer arguments, alpha and chi_1_minus_alpha, placed before tau.
// Suffix -> element type of every pointer argument: _ops float, _opd double,
// _opc scomplex, _opz dcomplex.
// NOTE(review): _l_ / _r_ presumably correspond to the front-end's FLA_Side
// argument; whether alpha and chi_1_minus_alpha are outputs cannot be told
// from the prototypes alone -- confirm against the implementation.
FLA_Error FLA_Househ2s_UT( FLA_Side side, FLA_Obj chi_1, FLA_Obj x2, FLA_Obj alpha, FLA_Obj chi_1_minus_alpha, FLA_Obj tau );
FLA_Error FLA_Househ2s_UT_l_ops( int    m_x2,
                                 float* chi_1,
                                 float* x2, int inc_x2,
                                 float* alpha,
                                 float* chi_1_minus_alpha,
                                 float* tau );
FLA_Error FLA_Househ2s_UT_l_opd( int     m_x2,
                                 double* chi_1,
                                 double* x2, int inc_x2,
                                 double* alpha,
                                 double* chi_1_minus_alpha,
                                 double* tau );
FLA_Error FLA_Househ2s_UT_l_opc( int       m_x2,
                                 scomplex* chi_1,
                                 scomplex* x2, int inc_x2,
                                 scomplex* alpha,
                                 scomplex* chi_1_minus_alpha,
                                 scomplex* tau );
FLA_Error FLA_Househ2s_UT_l_opz( int       m_x2,
                                 dcomplex* chi_1,
                                 dcomplex* x2, int inc_x2,
                                 dcomplex* alpha,
                                 dcomplex* chi_1_minus_alpha,
                                 dcomplex* tau );
FLA_Error FLA_Househ2s_UT_r_ops( int    m_x2,
                                 float* chi_1,
                                 float* x2, int inc_x2,
                                 float* alpha,
                                 float* chi_1_minus_alpha,
                                 float* tau );
FLA_Error FLA_Househ2s_UT_r_opd( int     m_x2,
                                 double* chi_1,
                                 double* x2, int inc_x2,
                                 double* alpha,
                                 double* chi_1_minus_alpha,
                                 double* tau );
FLA_Error FLA_Househ2s_UT_r_opc( int       m_x2,
                                 scomplex* chi_1,
                                 scomplex* x2, int inc_x2,
                                 scomplex* alpha,
                                 scomplex* chi_1_minus_alpha,
                                 scomplex* tau );
FLA_Error FLA_Househ2s_UT_r_opz( int       m_x2,
                                 dcomplex* chi_1,
                                 dcomplex* x2, int inc_x2,
                                 dcomplex* alpha,
                                 dcomplex* chi_1_minus_alpha,
                                 dcomplex* tau );

// Hev_2x2: takes three matrix entries (alpha11, alpha21, alpha22) and two
// lambda arguments. Only real-typed variants are declared: _ops (float) and
// _opd (double), each taking all five arguments by pointer.
// NOTE(review): presumably computes the two eigenvalues of a 2x2
// symmetric/Hermitian matrix given its lower triangle -- confirm.
// The typed variants here prefix parameter names with buff_, unlike the
// neighbouring Hevv_2x2 prototypes.
FLA_Error FLA_Hev_2x2( FLA_Obj alpha11, FLA_Obj alpha21, FLA_Obj alpha22,
                       FLA_Obj lambda1, FLA_Obj lambda2 );
FLA_Error FLA_Hev_2x2_ops( float*    buff_alpha11,
                           float*    buff_alpha21,
                           float*    buff_alpha22,
                           float*    buff_lambda1,
                           float*    buff_lambda2 );
FLA_Error FLA_Hev_2x2_opd( double*   buff_alpha11,
                           double*   buff_alpha21,
                           double*   buff_alpha22,
                           double*   buff_lambda1,
                           double*   buff_lambda2 );

// Hevv_2x2: same leading arguments as Hev_2x2 plus gamma1 and sigma1.
// In the complex variants (_opc / _opz) the matrix entries and sigma1 are
// complex while lambda1, lambda2 and gamma1 stay real (float / double).
// NOTE(review): gamma1 / sigma1 presumably describe a rotation (cosine/sine)
// yielding the eigenvectors -- confirm against the implementation.
FLA_Error FLA_Hevv_2x2( FLA_Obj alpha11, FLA_Obj alpha21, FLA_Obj alpha22,
                        FLA_Obj lambda1, FLA_Obj lambda2,
                        FLA_Obj gamma1,  FLA_Obj sigma1 );
FLA_Error FLA_Hevv_2x2_ops( float*    alpha11,
                            float*    alpha21,
                            float*    alpha22,
                            float*    lambda1,
                            float*    lambda2,
                            float*    gamma1,
                            float*    sigma1 );
FLA_Error FLA_Hevv_2x2_opd( double*   alpha11,
                            double*   alpha21,
                            double*   alpha22,
                            double*   lambda1,
                            double*   lambda2,
                            double*   gamma1,
                            double*   sigma1 );
FLA_Error FLA_Hevv_2x2_opc( scomplex* alpha11,
                            scomplex* alpha21,
                            scomplex* alpha22,
                            float*    lambda1,
                            float*    lambda2,
                            float*    gamma1,
                            scomplex* sigma1 );
FLA_Error FLA_Hevv_2x2_opz( dcomplex* alpha11,
                            dcomplex* alpha21,
                            dcomplex* alpha22,
                            double*   lambda1,
                            double*   lambda2,
                            double*   gamma1,
                            dcomplex* sigma1 );

// Wilkshift_tridiag: object-based front-end plus float (_ops) and double
// (_opd) variants. Unlike most typed variants in this header, the typed forms
// take delta1, epsilon and delta2 BY VALUE; only kappa is passed by pointer.
// NOTE(review): kappa is presumably the output shift -- confirm.
FLA_Error FLA_Wilkshift_tridiag( FLA_Obj delta1, FLA_Obj epsilon, FLA_Obj delta2, FLA_Obj kappa );
FLA_Error FLA_Wilkshift_tridiag_ops( float   delta1,
                                     float   epsilon,
                                     float   delta2,
                                     float*  kappa );
FLA_Error FLA_Wilkshift_tridiag_opd( double  delta1,
                                     double  epsilon,
                                     double  delta2,
                                     double* kappa );

// Pythag2: object-based front-end plus float (_ops) and double (_opd)
// variants taking chi, psi and rho by pointer.
// NOTE(review): presumably rho receives sqrt(chi^2 + psi^2) -- confirm.
FLA_Error FLA_Pythag2( FLA_Obj chi, FLA_Obj psi, FLA_Obj rho );
FLA_Error FLA_Pythag2_ops( float*    chi,
                           float*    psi,
                           float*    rho );
FLA_Error FLA_Pythag2_opd( double*   chi,
                           double*   psi,
                           double*   rho );

// Pythag3: three-operand counterpart of Pythag2 (chi, psi, zeta -> rho), with
// float (_ops) and double (_opd) variants taking every argument by pointer.
// NOTE(review): presumably rho receives sqrt(chi^2 + psi^2 + zeta^2) --
// confirm.
FLA_Error FLA_Pythag3( FLA_Obj chi, FLA_Obj psi, FLA_Obj zeta, FLA_Obj rho );
FLA_Error FLA_Pythag3_ops( float*    chi,
                           float*    psi,
                           float*    zeta,
                           float*    rho );
FLA_Error FLA_Pythag3_opd( double*   chi,
                           double*   psi,
                           double*   zeta,
                           double*   rho );

// Sort_evd: object-based front-end taking a direction, l and V, plus typed
// variants. In every typed variant l is real (float for _ops/_opc, double for
// _opd/_opz); V carries the variant's element type (float, double, scomplex,
// dcomplex). Each takes m_A, l with inc_l, and V with rs_V / cs_V.
// NOTE(review): _f_ / _b_ presumably select the two FLA_Direct directions;
// rs_V / cs_V are presumably row and column strides; l / V are presumably
// eigenvalues and eigenvectors reordered together -- confirm.
FLA_Error FLA_Sort_evd( FLA_Direct direct, FLA_Obj l, FLA_Obj V );
FLA_Error FLA_Sort_evd_f_ops( int       m_A,
                              float*    l, int inc_l,
                              float*    V, int rs_V, int cs_V );
FLA_Error FLA_Sort_evd_b_ops( int       m_A,
                              float*    l, int inc_l,
                              float*    V, int rs_V, int cs_V );
FLA_Error FLA_Sort_evd_f_opd( int       m_A,
                              double*   l, int inc_l,
                              double*   V, int rs_V, int cs_V );
FLA_Error FLA_Sort_evd_b_opd( int       m_A,
                              double*   l, int inc_l,
                              double*   V, int rs_V, int cs_V );
FLA_Error FLA_Sort_evd_f_opc( int       m_A,
                              float*    l, int inc_l,
                              scomplex* V, int rs_V, int cs_V );
FLA_Error FLA_Sort_evd_b_opc( int       m_A,
                              float*    l, int inc_l,
                              scomplex* V, int rs_V, int cs_V );
FLA_Error FLA_Sort_evd_f_opz( int       m_A,
                              double*   l, int inc_l,
                              dcomplex* V, int rs_V, int cs_V );
FLA_Error FLA_Sort_evd_b_opz( int       m_A,
                              double*   l, int inc_l,
                              dcomplex* V, int rs_V, int cs_V );

// Sort_bsvd_ext: the front-end takes a direction, s, and three matrices
// (U, V, C), each paired with an FLA_Bool apply_* flag. The typed variants
// have no apply_* flags; they take a dimension (m_s, m_U, m_V, n_C) ahead of
// each buffer, followed by inc_s for s and rs_* / cs_* for the matrices.
// s is real in every variant (float for _ops/_opc, double for _opd/_opz);
// U, V and C carry the variant's element type.
// NOTE(review): how the front-end's apply_* flags map onto the typed
// arguments is not visible from the prototypes; _f_ / _b_ presumably select
// the two FLA_Direct directions -- confirm.
FLA_Error FLA_Sort_bsvd_ext( FLA_Direct direct, FLA_Obj s,
                             FLA_Bool apply_U, FLA_Obj U,
                             FLA_Bool apply_V, FLA_Obj V,
                             FLA_Bool apply_C, FLA_Obj C );
FLA_Error FLA_Sort_bsvd_ext_f_ops( int m_s, float* s, int inc_s,
                                   int m_U, float* U, int rs_U, int cs_U,
                                   int m_V, float* V, int rs_V, int cs_V,
                                   int n_C, float* C, int rs_C, int cs_C );
FLA_Error FLA_Sort_bsvd_ext_b_ops( int m_s, float* s, int inc_s,
                                   int m_U, float* U, int rs_U, int cs_U,
                                   int m_V, float* V, int rs_V, int cs_V,
                                   int n_C, float* C, int rs_C, int cs_C );
FLA_Error FLA_Sort_bsvd_ext_f_opd( int m_s, double* s, int inc_s,
                                   int m_U, double* U, int rs_U, int cs_U,
                                   int m_V, double* V, int rs_V, int cs_V,
                                   int n_C, double* C, int rs_C, int cs_C );
FLA_Error FLA_Sort_bsvd_ext_b_opd( int m_s, double* s, int inc_s,
                                   int m_U, double* U, int rs_U, int cs_U,
                                   int m_V, double* V, int rs_V, int cs_V,
                                   int n_C, double* C, int rs_C, int cs_C );
FLA_Error FLA_Sort_bsvd_ext_f_opc( int m_s, float*    s, int inc_s,
                                   int m_U, scomplex* U, int rs_U, int cs_U,
                                   int m_V, scomplex* V, int rs_V, int cs_V,
                                   int n_C, scomplex* C, int rs_C, int cs_C );
FLA_Error FLA_Sort_bsvd_ext_b_opc( int m_s, float*    s, int inc_s,
                                   int m_U, scomplex* U, int rs_U, int cs_U,
                                   int m_V, scomplex* V, int rs_V, int cs_V,
                                   int n_C, scomplex* C, int rs_C, int cs_C );
FLA_Error FLA_Sort_bsvd_ext_f_opz( int m_s, double*   s, int inc_s,
                                   int m_U, dcomplex* U, int rs_U, int cs_U,
                                   int m_V, dcomplex* V, int rs_V, int cs_V,
                                   int n_C, dcomplex* C, int rs_C, int cs_C );
FLA_Error FLA_Sort_bsvd_ext_b_opz( int m_s, double*   s, int inc_s,
                                   int m_U, dcomplex* U, int rs_U, int cs_U,
                                   int m_V, dcomplex* V, int rs_V, int cs_V,
                                   int n_C, dcomplex* C, int rs_C, int cs_C );

// Sort_svd: object-based front-end taking a direction, s, U and V, plus typed
// variants. Each typed variant takes m_U and n_V, then s with inc_s, then U
// and V each with rs_* / cs_*. s is real in every variant (float for
// _ops/_opc, double for _opd/_opz); U and V carry the variant's element type.
// NOTE(review): _f_ / _b_ presumably select the two FLA_Direct directions;
// rs_* / cs_* are presumably row and column strides -- confirm.
FLA_Error FLA_Sort_svd( FLA_Direct direct, FLA_Obj s, FLA_Obj U, FLA_Obj V );
FLA_Error FLA_Sort_svd_f_ops( int       m_U,
                              int       n_V,
                              float*    s, int inc_s,
                              float*    U, int rs_U, int cs_U,
                              float*    V, int rs_V, int cs_V );
FLA_Error FLA_Sort_svd_b_ops( int       m_U,
                              int       n_V,
                              float*    s, int inc_s,
                              float*    U, int rs_U, int cs_U,
                              float*    V, int rs_V, int cs_V );
FLA_Error FLA_Sort_svd_f_opd( int       m_U,
                              int       n_V,
                              double*   s, int inc_s,
                              double*   U, int rs_U, int cs_U,
                              double*   V, int rs_V, int cs_V );
FLA_Error FLA_Sort_svd_b_opd( int       m_U,
                              int       n_V,
                              double*   s, int inc_s,
                              double*   U, int rs_U, int cs_U,
                              double*   V, int rs_V, int cs_V );
FLA_Error FLA_Sort_svd_f_opc( int       m_U,
                              int       n_V,
                              float*    s, int inc_s,
                              scomplex* U, int rs_U, int cs_U,
                              scomplex* V, int rs_V, int cs_V );
FLA_Error FLA_Sort_svd_b_opc( int       m_U,
                              int       n_V,
                              float*    s, int inc_s,
                              scomplex* U, int rs_U, int cs_U,
                              scomplex* V, int rs_V, int cs_V );
FLA_Error FLA_Sort_svd_f_opz( int       m_U,
                              int       n_V,
                              double*   s, int inc_s,
                              dcomplex* U, int rs_U, int cs_U,
                              dcomplex* V, int rs_V, int cs_V );
FLA_Error FLA_Sort_svd_b_opz( int       m_U,
                              int       n_V,
                              double*   s, int inc_s,
                              dcomplex* U, int rs_U, int cs_U,
                              dcomplex* V, int rs_V, int cs_V );

// Sv_2x2: takes three matrix entries (alpha11, alpha12, alpha22) and two
// sigma arguments. Only real-typed variants are declared: _ops (float) and
// _opd (double), each taking all five arguments by pointer.
// NOTE(review): presumably computes the singular values of a 2x2 upper
// triangular matrix into sigma1 / sigma2 -- confirm.
FLA_Error FLA_Sv_2x2( FLA_Obj alpha11, FLA_Obj alpha12, FLA_Obj alpha22,
                      FLA_Obj sigma1, FLA_Obj sigma2 );
FLA_Error FLA_Sv_2x2_ops( float*    alpha11,
                          float*    alpha12,
                          float*    alpha22,
                          float*    sigma1,
                          float*    sigma2 );
FLA_Error FLA_Sv_2x2_opd( double*   alpha11,
                          double*   alpha12,
                          double*   alpha22,
                          double*   sigma1,
                          double*   sigma2 );

// Svv_2x2: same leading arguments as Sv_2x2 plus two (gamma, sigma) pairs
// suffixed L and R. Only float (_ops) and double (_opd) variants are
// declared, each taking all nine arguments by pointer.
// NOTE(review): the L / R pairs presumably describe left and right rotations
// (singular vectors) -- confirm against the implementation.
FLA_Error FLA_Svv_2x2( FLA_Obj alpha11, FLA_Obj alpha12, FLA_Obj alpha22,
                       FLA_Obj sigma1, FLA_Obj sigma2,
                       FLA_Obj gammaL, FLA_Obj sigmaL,
                       FLA_Obj gammaR, FLA_Obj sigmaR );
FLA_Error FLA_Svv_2x2_ops( float*    alpha11,
                           float*    alpha12,
                           float*    alpha22,
                           float*    sigma1,
                           float*    sigma2,
                           float*    gammaL,
                           float*    sigmaL,
                           float*    gammaR,
                           float*    sigmaR );
FLA_Error FLA_Svv_2x2_opd( double*   alpha11,
                           double*   alpha12,
                           double*   alpha22,
                           double*   sigma1,
                           double*   sigma2,
                           double*   gammaL,
                           double*   sigmaL,
                           double*   gammaR,
                           double*   sigmaR );

// Mach_params: the object-based front-end takes the FLA_Machval selector and
// an FLA_Obj val and returns FLA_Error; the typed variants instead RETURN a
// float (_ops) / double (_opd) for the given selector.
// NOTE(review): val presumably receives the selected value -- confirm.
FLA_Error FLA_Mach_params( FLA_Machval machval, FLA_Obj val );
float     FLA_Mach_params_ops( FLA_Machval machval );
double    FLA_Mach_params_opd( FLA_Machval machval );

// Miscellaneous object-based LAPACK-related utilities. Each has a matching
// *_check prototype (same argument list) declared further down in this header.
FLA_Error FLA_Apply_diag_matrix( FLA_Side side, FLA_Conj conj, FLA_Obj x, FLA_Obj A );
FLA_Error FLA_Shift_pivots_to( FLA_Pivot_type ptype, FLA_Obj p );
FLA_Error FLA_Form_perm_matrix( FLA_Obj p, FLA_Obj A );
FLA_Error FLA_LU_find_zero_on_diagonal( FLA_Obj A );

// --- f2c-converted routine prototypes ----------------------------------------

// These use f2c-style typedefs (doublereal, real, logical, integer, ftnlen)
// and take trailing ftnlen arguments alongside their char* arguments.
// NOTE(review): the ftnlen arguments are presumably the Fortran hidden
// string lengths of the corresponding char* arguments -- confirm.
// NOTE(review): fla_pow_di is declared as returning plain double, while
// fla_dlamch returns doublereal and fla_pow_ri returns real; verify that this
// matches the definition before normalizing the return type.
doublereal fla_dlamch( char* cmach, ftnlen cmach_len );
real       fla_slamch( char* cmach, ftnlen cmach_len );
logical    fla_lsame( char* ca, char* cb, ftnlen ca_len, ftnlen cb_len );
double     fla_pow_di( doublereal* a, integer* n );
real       fla_pow_ri( real* a, integer* n );

// --- LAPACK-related utility check routine prototypes -------------------------

// Checks for the Householder front-ends; argument lists are identical to
// FLA_Househ2_UT, FLA_Househ3UD_UT and FLA_Househ2s_UT declared above.
FLA_Error FLA_Househ2_UT_check( FLA_Side side, FLA_Obj chi_1, FLA_Obj x2, FLA_Obj tau );
FLA_Error FLA_Househ3UD_UT_check( FLA_Obj chi_1, FLA_Obj x2, FLA_Obj y2, FLA_Obj tau );
FLA_Error FLA_Househ2s_UT_check( FLA_Side side, FLA_Obj chi_1, FLA_Obj x2, FLA_Obj alpha, FLA_Obj chi_1_minus_alpha, FLA_Obj tau );

// Checks for the Givens / shift / bulge / machine-parameter routines.
// FLA_Wilkshift_tridiag_check and FLA_Mach_params_check match the argument
// lists of FLA_Wilkshift_tridiag and FLA_Mach_params declared above.
// NOTE(review): the remaining checks presumably mirror routines declared
// elsewhere in this header -- confirm.
FLA_Error FLA_Givens2_check( FLA_Obj chi_1, FLA_Obj chi_2, FLA_Obj gamma, FLA_Obj sigma, FLA_Obj chi_1_new );
FLA_Error FLA_Apply_GTG_check( FLA_Obj gamma, FLA_Obj sigma, FLA_Obj delta1, FLA_Obj epsilon1, FLA_Obj delta2 );
FLA_Error FLA_Apply_G_1x2_check( FLA_Obj gamma, FLA_Obj sigma, FLA_Obj beta, FLA_Obj epsilon );
FLA_Error FLA_Apply_G_mx2_check( FLA_Obj gamma, FLA_Obj sigma, FLA_Obj a1, FLA_Obj a2 );
FLA_Error FLA_Apply_G_check( FLA_Side side, FLA_Direct direct, FLA_Obj G, FLA_Obj A );
FLA_Error FLA_Wilkshift_tridiag_check( FLA_Obj delta1, FLA_Obj epsilon, FLA_Obj delta2, FLA_Obj kappa );
FLA_Error FLA_Wilkshift_bidiag_check( FLA_Obj epsilon1, FLA_Obj delta1, FLA_Obj epsilon2, FLA_Obj delta2, FLA_Obj kappa );
FLA_Error FLA_Introduce_bulge_check( FLA_Obj shift, FLA_Obj gamma, FLA_Obj sigma, FLA_Obj delta1, FLA_Obj epsilon1, FLA_Obj delta2, FLA_Obj beta, FLA_Obj epsilon2 );
FLA_Error FLA_Mach_params_check( FLA_Machval machval, FLA_Obj val );

// Checks for the sort front-ends; argument lists match FLA_Sort_evd and
// FLA_Sort_svd declared above. No check is declared here for
// FLA_Sort_bsvd_ext.
FLA_Error FLA_Sort_evd_check( FLA_Direct direct, FLA_Obj l, FLA_Obj V );
FLA_Error FLA_Sort_svd_check( FLA_Direct direct, FLA_Obj s, FLA_Obj U, FLA_Obj V );

// Checks for the miscellaneous utilities; argument lists match the
// same-named routines declared above.
FLA_Error FLA_Apply_diag_matrix_check( FLA_Side side, FLA_Conj conj, FLA_Obj x, FLA_Obj A );
FLA_Error FLA_Shift_pivots_to_check( FLA_Pivot_type ptype, FLA_Obj p );
FLA_Error FLA_Form_perm_matrix_check( FLA_Obj p, FLA_Obj A );
FLA_Error FLA_LU_find_zero_on_diagonal_check( FLA_Obj A );

// end FLA_util_lapack_prototypes.h

  // Include prototypes for FLAME interfaces to BLAS and LAPACK operations.
// begin FLA_blas1_prototypes.h


#ifdef FLA_ENABLE_HIP
#include <rocblas/rocblas.h> // skipped
#endif

// --- top-level wrapper prototypes --------------------------------------------

// Object-based level-1 BLAS front-ends. Every routine returns FLA_Error and
// takes its operands as FLA_Obj values, preceded by any option flags
// (FLA_Uplo, FLA_Trans, FLA_Conj).
// Each routine is declared exactly once; a second, identical declaration of
// FLA_Copyr that used to follow FLA_Copyt was redundant and has been dropped.
FLA_Error FLA_Asum( FLA_Obj x, FLA_Obj asum_x );
FLA_Error FLA_Axpy( FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Axpys( FLA_Obj alpha0, FLA_Obj alpha1, FLA_Obj A, FLA_Obj beta, FLA_Obj B );
FLA_Error FLA_Axpyt( FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Axpyrt( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Copy( FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Copyr( FLA_Uplo uplo, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Copyrt( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Copyt( FLA_Trans trans, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Dot( FLA_Obj x, FLA_Obj y, FLA_Obj rho );
FLA_Error FLA_Dot2cs( FLA_Conj conj, FLA_Obj alpha, FLA_Obj x, FLA_Obj y, FLA_Obj beta, FLA_Obj rho );
FLA_Error FLA_Dot2s( FLA_Obj alpha, FLA_Obj x, FLA_Obj y, FLA_Obj beta, FLA_Obj rho );
FLA_Error FLA_Dotc( FLA_Conj conj, FLA_Obj x, FLA_Obj y, FLA_Obj rho );
FLA_Error FLA_Dotcs( FLA_Conj conj, FLA_Obj alpha, FLA_Obj x, FLA_Obj y, FLA_Obj beta, FLA_Obj rho );
FLA_Error FLA_Dots( FLA_Obj alpha, FLA_Obj x, FLA_Obj y, FLA_Obj beta, FLA_Obj rho );
FLA_Error FLA_Amax( FLA_Obj x, FLA_Obj index );
FLA_Error FLA_Inv_scal( FLA_Obj alpha, FLA_Obj A );
FLA_Error FLA_Inv_scalc( FLA_Conj conjalpha, FLA_Obj alpha, FLA_Obj A );
FLA_Error FLA_Nrm2( FLA_Obj x, FLA_Obj norm_x );
FLA_Error FLA_Scal( FLA_Obj alpha, FLA_Obj A );
FLA_Error FLA_Scalc( FLA_Conj conjalpha, FLA_Obj alpha, FLA_Obj A );
FLA_Error FLA_Scalr( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A );
FLA_Error FLA_Swap( FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Swapt( FLA_Trans trans, FLA_Obj A, FLA_Obj B );


// --- task wrapper prototypes -------------------------------------------------

// Task wrappers: same operands as the corresponding front-end, plus a
// trailing operation-specific cntl pointer (fla_axpy_t*, fla_copyt_t*, ...).
// NOTE(review): cntl is presumably a control structure for the operation --
// confirm.
FLA_Error FLA_Axpy_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpy_t* cntl );
FLA_Error FLA_Axpyt_task( FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl );
FLA_Error FLA_Copy_task( FLA_Obj A, FLA_Obj B, fla_copy_t* cntl );
FLA_Error FLA_Copyt_task( FLA_Trans trans, FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl );
FLA_Error FLA_Copyr_task( FLA_Uplo uplo, FLA_Obj A, FLA_Obj B, fla_copyr_t* cntl );
FLA_Error FLA_Scal_task( FLA_Obj alpha, FLA_Obj A, fla_scal_t* cntl );
FLA_Error FLA_Scalr_task( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, fla_scalr_t* cntl );

// Axpyt task wrappers without the FLA_Trans argument.
// NOTE(review): the _n/_t/_c/_h infix presumably fixes the transposition
// case (none / transpose / conjugate / conjugate-transpose) -- confirm.
FLA_Error FLA_Axpyt_n_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl );
FLA_Error FLA_Axpyt_t_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl );
FLA_Error FLA_Axpyt_c_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl );
FLA_Error FLA_Axpyt_h_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl );

// Copyt task wrappers without the FLA_Trans argument (same infix scheme as
// the Axpyt wrappers above).
FLA_Error FLA_Copyt_n_task( FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl );
FLA_Error FLA_Copyt_t_task( FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl );
FLA_Error FLA_Copyt_c_task( FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl );
FLA_Error FLA_Copyt_h_task( FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl );

// Copyr task wrappers without the FLA_Uplo argument.
// NOTE(review): _l / _u presumably fix the lower / upper case -- confirm.
FLA_Error FLA_Copyr_l_task( FLA_Obj A, FLA_Obj B, fla_copyr_t* cntl );
FLA_Error FLA_Copyr_u_task( FLA_Obj A, FLA_Obj B, fla_copyr_t* cntl );

// Scalr task wrappers without the FLA_Uplo argument (same _l / _u scheme).
FLA_Error FLA_Scalr_l_task( FLA_Obj alpha, FLA_Obj A, fla_scalr_t* cntl );
FLA_Error FLA_Scalr_u_task( FLA_Obj alpha, FLA_Obj A, fla_scalr_t* cntl );


// --- external wrapper prototypes ---------------------------------------------

// _external counterparts of the level-1 front-ends. Argument types and order
// match the front-end of the same base name; only one parameter name differs
// (FLA_Nrm2 uses norm_x, FLA_Nrm2_external uses nrm_x).
// NOTE(review): presumably these dispatch to an external BLAS implementation
// -- confirm.
FLA_Error FLA_Asum_external( FLA_Obj x, FLA_Obj asum_x );
FLA_Error FLA_Axpy_external( FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Axpys_external( FLA_Obj alpha0, FLA_Obj alpha1, FLA_Obj A, FLA_Obj beta, FLA_Obj B );
FLA_Error FLA_Axpyt_external( FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Axpyrt_external( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Copy_external( FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Copyr_external( FLA_Uplo uplo, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Copyrt_external( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Copyt_external( FLA_Trans trans, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Dot_external( FLA_Obj x, FLA_Obj y, FLA_Obj rho );
FLA_Error FLA_Dotc_external( FLA_Conj conj, FLA_Obj x, FLA_Obj y, FLA_Obj rho );
FLA_Error FLA_Dots_external( FLA_Obj alpha, FLA_Obj x, FLA_Obj y, FLA_Obj beta, FLA_Obj rho );
FLA_Error FLA_Dotcs_external( FLA_Conj conj, FLA_Obj alpha, FLA_Obj x, FLA_Obj y, FLA_Obj beta, FLA_Obj rho );
FLA_Error FLA_Dot2s_external( FLA_Obj alpha, FLA_Obj x, FLA_Obj y, FLA_Obj beta, FLA_Obj rho );
FLA_Error FLA_Dot2cs_external( FLA_Conj conj, FLA_Obj alpha, FLA_Obj x, FLA_Obj y, FLA_Obj beta, FLA_Obj rho );
FLA_Error FLA_Amax_external( FLA_Obj x, FLA_Obj index );
FLA_Error FLA_Inv_scal_external( FLA_Obj alpha, FLA_Obj A );
FLA_Error FLA_Inv_scalc_external( FLA_Conj conjalpha, FLA_Obj alpha, FLA_Obj A );
FLA_Error FLA_Nrm2_external( FLA_Obj x, FLA_Obj nrm_x );
FLA_Error FLA_Scal_external( FLA_Obj alpha, FLA_Obj A );
FLA_Error FLA_Scalc_external( FLA_Conj conjalpha, FLA_Obj alpha, FLA_Obj A );
FLA_Error FLA_Scalr_external( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A );
FLA_Error FLA_Swap_external( FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Swapt_external( FLA_Trans trans, FLA_Obj A, FLA_Obj B );


// --- gpu wrapper prototypes --------------------------------------------------

// Each matrix operand (A, B) is followed by an untyped void* *_gpu pointer;
// the scalar alpha has no such companion. These prototypes are not guarded
// by any preprocessor conditional at this point in the header.
// NOTE(review): the *_gpu pointers are presumably device-side buffers for
// the preceding FLA_Obj -- confirm.
FLA_Error FLA_Axpy_external_gpu( FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu );
FLA_Error FLA_Copy_external_gpu( FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu );
FLA_Error FLA_Scal_external_gpu( FLA_Obj alpha, FLA_Obj A, void* A_gpu );
FLA_Error FLA_Scalr_external_gpu( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, void* A_gpu );


// --- hip wrapper prototypes --------------------------------------------------
// Declared only when FLA_ENABLE_HIP is defined. Every routine takes a
// rocblas_handle as its first argument and pairs each matrix FLA_Obj with a
// void* pointer (named *_gpu in some prototypes and *_hip in others).
// NOTE(review): the void* companions are presumably device-side buffers --
// confirm.
#ifdef FLA_ENABLE_HIP
FLA_Error FLA_Axpy_external_hip( rocblas_handle handle, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu );
FLA_Error FLA_Copy_external_hip( rocblas_handle handle, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu );
FLA_Error FLA_Copyr_external_hip( rocblas_handle handle, FLA_Uplo uplo, FLA_Obj A, void* A_hip, FLA_Obj B, void* B_hip );
FLA_Error FLA_Scal_external_hip( rocblas_handle handle, FLA_Obj alpha, FLA_Obj A, void* A_gpu );
FLA_Error FLA_Scalr_external_hip( rocblas_handle handle, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, void* A_gpu );

// The Copyconj routines take a single FLA_Obj (A) with its void* companion
// and a bare void* B_mat with no accompanying FLA_Obj.
FLA_Error FLA_Copyconj_tri_external_hip( rocblas_handle handle, FLA_Uplo uplo, FLA_Obj A, void* A_hip, void* B_mat );
FLA_Error FLA_Copyconj_general_external_hip( rocblas_handle handle, FLA_Obj A, void* A_hip, void* B_mat );
#endif

// --- check routine prototypes ------------------------------------------------

// front-ends: one *_check per level-1 front-end, with the same argument types
// and order as the front-end of the same base name.
FLA_Error FLA_Asum_check( FLA_Obj x, FLA_Obj asum_x );
FLA_Error FLA_Axpy_check( FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Axpys_check( FLA_Obj alpha0, FLA_Obj alpha1, FLA_Obj A, FLA_Obj beta, FLA_Obj B );
FLA_Error FLA_Axpyt_check( FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Axpyrt_check( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Copy_check( FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Copyr_check( FLA_Uplo uplo, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Copyrt_check( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Copyt_check( FLA_Trans trans, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Dot_check( FLA_Obj x, FLA_Obj y, FLA_Obj rho );
FLA_Error FLA_Dotc_check( FLA_Conj conj, FLA_Obj x, FLA_Obj y, FLA_Obj rho );
FLA_Error FLA_Dots_check( FLA_Obj alpha, FLA_Obj x, FLA_Obj y, FLA_Obj beta, FLA_Obj rho );
FLA_Error FLA_Dotcs_check( FLA_Conj conj, FLA_Obj alpha, FLA_Obj x, FLA_Obj y, FLA_Obj beta, FLA_Obj rho );
FLA_Error FLA_Dot2s_check( FLA_Obj alpha, FLA_Obj x, FLA_Obj y, FLA_Obj beta, FLA_Obj rho );
FLA_Error FLA_Dot2cs_check( FLA_Conj conj, FLA_Obj alpha, FLA_Obj x, FLA_Obj y, FLA_Obj beta, FLA_Obj rho );
FLA_Error FLA_Amax_check( FLA_Obj x, FLA_Obj index );
FLA_Error FLA_Inv_scal_check( FLA_Obj alpha, FLA_Obj A );
FLA_Error FLA_Inv_scalc_check( FLA_Conj conjalpha, FLA_Obj alpha, FLA_Obj A );
FLA_Error FLA_Nrm2_check( FLA_Obj x, FLA_Obj nrm_x );
FLA_Error FLA_Scal_check( FLA_Obj alpha, FLA_Obj A );
FLA_Error FLA_Scalc_check( FLA_Conj conjalpha, FLA_Obj alpha, FLA_Obj A );
FLA_Error FLA_Scalr_check( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A );
FLA_Error FLA_Swap_check( FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Swapt_check( FLA_Trans trans, FLA_Obj A, FLA_Obj B );

// internal back-ends: *_internal_check prototypes taking the operation's
// operands plus its trailing cntl pointer, i.e. the same argument lists as the
// *_task wrappers of the same base name.
FLA_Error FLA_Axpy_internal_check( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpy_t* cntl );
FLA_Error FLA_Axpyt_internal_check( FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl );
FLA_Error FLA_Copy_internal_check( FLA_Obj A, FLA_Obj B, fla_copy_t* cntl );
FLA_Error FLA_Copyt_internal_check( FLA_Trans trans, FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl );
FLA_Error FLA_Copyr_internal_check( FLA_Uplo uplo, FLA_Obj A, FLA_Obj B, fla_copyr_t* cntl );
FLA_Error FLA_Scal_internal_check( FLA_Obj alpha, FLA_Obj A, fla_scal_t* cntl );
FLA_Error FLA_Scalr_internal_check( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, fla_scalr_t* cntl );

// end FLA_blas1_prototypes.h
// begin FLA_blas2_prototypes.h


#ifdef FLA_ENABLE_HIP
#include <rocblas/rocblas.h> // skipped
#endif

// --- top-level wrapper prototypes --------------------------------------------

// Object-based level-2 BLAS front-ends. Every routine returns FLA_Error and
// takes its operands as FLA_Obj values, preceded by option flags (FLA_Uplo,
// FLA_Trans, FLA_Diag, FLA_Conj). Routines whose name ends in 'c' (Gemvc,
// Gerc, Hemvc, Herc, Her2c) take one or more extra FLA_Conj arguments compared
// with their base routine; the *sx routines (Trmvsx, Trsvsx) add alpha, beta
// and a separate y operand to the Trmv / Trsv argument list.
FLA_Error FLA_Gemv( FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y );
FLA_Error FLA_Gemvc( FLA_Trans transa, FLA_Conj conjx, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y );
FLA_Error FLA_Ger( FLA_Obj alpha, FLA_Obj x, FLA_Obj y, FLA_Obj A );
FLA_Error FLA_Gerc( FLA_Conj conjx, FLA_Conj conjy, FLA_Obj alpha, FLA_Obj x, FLA_Obj y, FLA_Obj A );
FLA_Error FLA_Hemv( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y );
FLA_Error FLA_Hemvc( FLA_Uplo uplo, FLA_Conj conja, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y );
FLA_Error FLA_Her( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj x, FLA_Obj A );
FLA_Error FLA_Herc( FLA_Uplo uplo, FLA_Conj conj, FLA_Obj alpha, FLA_Obj x, FLA_Obj A );
FLA_Error FLA_Her2( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj x, FLA_Obj y, FLA_Obj A );
FLA_Error FLA_Her2c( FLA_Uplo uplo, FLA_Conj conj, FLA_Obj alpha, FLA_Obj x, FLA_Obj y, FLA_Obj A );
FLA_Error FLA_Symv( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y );
FLA_Error FLA_Syr( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj x, FLA_Obj A );
FLA_Error FLA_Syr2( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj x, FLA_Obj y, FLA_Obj A );
FLA_Error FLA_Trmv( FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj A, FLA_Obj x );
FLA_Error FLA_Trmvsx( FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y );
FLA_Error FLA_Trsv( FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj A, FLA_Obj x );
FLA_Error FLA_Trsvsx( FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y );


// --- task wrapper prototypes -------------------------------------------------

// Task wrappers for Gemv and Trsv: same operands as the front-end plus a
// trailing cntl pointer (fla_gemv_t* / fla_trsv_t*).
FLA_Error FLA_Gemv_task( FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t* cntl );
FLA_Error FLA_Trsv_task( FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl );

// Gemv task wrappers without the FLA_Trans argument.
// NOTE(review): _h / _n / _t presumably fix the transposition case
// (conjugate-transpose / none / transpose) -- confirm.
FLA_Error FLA_Gemv_h_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t* cntl );
FLA_Error FLA_Gemv_n_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t* cntl );
FLA_Error FLA_Gemv_t_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t* cntl );

// Trsv task wrappers without the FLA_Uplo and FLA_Trans arguments; only
// FLA_Diag remains.
// NOTE(review): the two-letter infix presumably encodes uplo (l / u) followed
// by the transposition case (c / n / t) -- confirm.
FLA_Error FLA_Trsv_lc_task( FLA_Diag diag, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl );
FLA_Error FLA_Trsv_ln_task( FLA_Diag diag, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl );
FLA_Error FLA_Trsv_lt_task( FLA_Diag diag, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl );
FLA_Error FLA_Trsv_uc_task( FLA_Diag diag, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl );
FLA_Error FLA_Trsv_un_task( FLA_Diag diag, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl );
FLA_Error FLA_Trsv_ut_task( FLA_Diag diag, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl );


// --- external wrapper prototypes ---------------------------------------------

// One *_external prototype per operation in this section. Each parameter list
// is identical to that of the like-named *_check front-end declared further
// down in this header. All return FLA_Error and take operands as FLA_Obj
// values.
FLA_Error FLA_Gemv_external( FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y );
FLA_Error FLA_Gemvc_external( FLA_Trans transa, FLA_Conj conjx, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y );
FLA_Error FLA_Ger_external( FLA_Obj alpha, FLA_Obj x, FLA_Obj y, FLA_Obj A );
FLA_Error FLA_Gerc_external( FLA_Conj conjx, FLA_Conj conjy, FLA_Obj alpha, FLA_Obj x, FLA_Obj y, FLA_Obj A );
FLA_Error FLA_Hemv_external( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y );
FLA_Error FLA_Hemvc_external( FLA_Uplo uplo, FLA_Conj conja, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y );
FLA_Error FLA_Her_external( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj x, FLA_Obj A );
FLA_Error FLA_Herc_external( FLA_Uplo uplo, FLA_Conj conj, FLA_Obj alpha, FLA_Obj x, FLA_Obj A );
FLA_Error FLA_Her2_external( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj x, FLA_Obj y, FLA_Obj A );
FLA_Error FLA_Her2c_external( FLA_Uplo uplo, FLA_Conj conj, FLA_Obj alpha, FLA_Obj x, FLA_Obj y, FLA_Obj A );
FLA_Error FLA_Symv_external( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y );
FLA_Error FLA_Syr_external( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj x, FLA_Obj A );
FLA_Error FLA_Syr2_external( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj x, FLA_Obj y, FLA_Obj A );
FLA_Error FLA_Trmv_external( FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj A, FLA_Obj x );
FLA_Error FLA_Trmvsx_external( FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y );
FLA_Error FLA_Trsv_external( FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj A, FLA_Obj x );
FLA_Error FLA_Trsvsx_external( FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y );


// --- gpu wrapper prototypes --------------------------------------------------

// Same arguments as FLA_Gemv_external / FLA_Trsv_external, except that each of
// the operands A, x and y is immediately followed by a void* companion
// parameter (A_gpu, x_gpu, y_gpu). alpha and beta have no companion.
// NOTE(review): the *_gpu pointers are presumably the device-side buffers for
// the preceding object -- confirm against the implementations.
FLA_Error FLA_Gemv_external_gpu( FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj x, void* x_gpu, FLA_Obj beta, FLA_Obj y, void* y_gpu );
FLA_Error FLA_Trsv_external_gpu( FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj A, void* A_gpu, FLA_Obj x, void* x_gpu );


// --- hip wrapper prototypes --------------------------------------------------
// Declared only when FLA_ENABLE_HIP is defined. Parameter lists equal those of
// the corresponding *_external_gpu prototypes with a leading rocblas_handle.
// NOTE(review): rocblas_handle must already be declared at this point; confirm
// that an earlier include in this header provides it when HIP is enabled.
#ifdef FLA_ENABLE_HIP
FLA_Error FLA_Gemv_external_hip( rocblas_handle handle, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj x, void* x_gpu, FLA_Obj beta, FLA_Obj y, void* y_gpu );
FLA_Error FLA_Trsv_external_hip( rocblas_handle handle, FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj A, void* A_gpu, FLA_Obj x, void* x_gpu );
#endif

// --- check routine prototypes ------------------------------------------------

// front-ends
// One *_check prototype per operation; each takes exactly the arguments of the
// *_external wrapper of the same operation and returns FLA_Error.
FLA_Error FLA_Gemv_check( FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y );
FLA_Error FLA_Gemvc_check( FLA_Trans transa, FLA_Conj conjx, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y );
FLA_Error FLA_Ger_check( FLA_Obj alpha, FLA_Obj x, FLA_Obj y, FLA_Obj A );
FLA_Error FLA_Gerc_check( FLA_Conj conjx, FLA_Conj conjy, FLA_Obj alpha, FLA_Obj x, FLA_Obj y, FLA_Obj A );
FLA_Error FLA_Hemv_check( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y );
FLA_Error FLA_Hemvc_check( FLA_Uplo uplo, FLA_Conj conja, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y );
FLA_Error FLA_Her_check( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj x, FLA_Obj A );
FLA_Error FLA_Herc_check( FLA_Uplo uplo, FLA_Conj conj, FLA_Obj alpha, FLA_Obj x, FLA_Obj A );
FLA_Error FLA_Her2_check( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj x, FLA_Obj y, FLA_Obj A );
FLA_Error FLA_Her2c_check( FLA_Uplo uplo, FLA_Conj conj, FLA_Obj alpha, FLA_Obj x, FLA_Obj y, FLA_Obj A );
FLA_Error FLA_Symv_check( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y );
FLA_Error FLA_Syr_check( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj x, FLA_Obj A );
FLA_Error FLA_Syr2_check( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj x, FLA_Obj y, FLA_Obj A );
FLA_Error FLA_Trmv_check( FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj A, FLA_Obj x );
FLA_Error FLA_Trmvsx_check( FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y );
FLA_Error FLA_Trsv_check( FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj A, FLA_Obj x );
FLA_Error FLA_Trsvsx_check( FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y );

// internal back-ends
// Same argument lists as FLA_Gemv_task / FLA_Trsv_task, including the trailing
// `cntl` pointer.
FLA_Error FLA_Gemv_internal_check( FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t* cntl );
FLA_Error FLA_Trsv_internal_check( FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl );

// end FLA_blas2_prototypes.h
// begin FLA_blas3_prototypes.h


#ifdef FLA_ENABLE_HIP
#include <rocblas/rocblas.h> // skipped
#endif

// --- top-level wrapper prototypes --------------------------------------------

// Top-level BLAS-3 wrappers. All return FLA_Error and take their operands as
// FLA_Obj values. FLA_Trmmsx / FLA_Trsmsx take the FLA_Trmm / FLA_Trsm
// arguments plus beta and a separate object C.
FLA_Error FLA_Gemm( FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm( FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Herk( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm( FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syrk( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Trmm( FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmmsx( FLA_Side side, FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Trsm( FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsmsx( FLA_Side side, FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// Three further entry points with exactly FLA_Gemm's parameter list.
// NOTE(review): presumably shape-specialised forms of Gemm -- confirm.
FLA_Error FLA_Gemp( FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gepm( FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gepp( FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );


// --- task wrapper prototypes -------------------------------------------------

// Generic task wrappers: the argument list of the top-level wrapper of the
// same operation followed by a trailing `cntl` pointer of that operation's
// fla_<op>_t type. No task wrapper is declared here for Trmmsx, Trsmsx, Gemp,
// Gepm or Gepp.
FLA_Error FLA_Gemm_task( FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Hemm_task( FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Herk_task( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );
FLA_Error FLA_Her2k_task( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Symm_task( FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Syrk_task( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );
FLA_Error FLA_Syr2k_task( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Trmm_task( FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trsm_task( FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );

// Sixteen Gemm task variants, one per two-letter suffix drawn from
// {c,h,n,t} x {c,h,n,t}. They take FLA_Gemm_task's arguments without transa
// and transb.
// NOTE(review): the suffix letters presumably encode the transa/transb values
// handled by each variant -- confirm against the implementations.
FLA_Error FLA_Gemm_cc_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_ch_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_cn_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_ct_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_hc_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_hh_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_hn_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_ht_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_nc_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_nh_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_nn_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_nt_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_tc_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_th_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_tn_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_tt_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );

// Suffixed task variants for Hemm, Her2k, Herk, Symm, Syr2k and Syrk. Each
// drops the two leading selector arguments of the generic *_task wrapper
// (side/uplo or uplo/trans) and keeps the objects and the `cntl` pointer.
// NOTE(review): the two-letter suffix presumably encodes the dropped selector
// values -- confirm against the implementations.

// Hemm: suffixes {l,r} x {l,u}; side and uplo are not passed.
FLA_Error FLA_Hemm_ll_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_lu_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_rl_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_ru_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );

// Her2k: suffixes {l,u} x {n,h}; uplo and trans are not passed.
FLA_Error FLA_Her2k_ln_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_lh_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_un_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_uh_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );

// Herk: suffixes {l,u} x {n,h}; uplo and trans are not passed.
FLA_Error FLA_Herk_ln_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );
FLA_Error FLA_Herk_lh_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );
FLA_Error FLA_Herk_un_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );
FLA_Error FLA_Herk_uh_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );

// Symm: suffixes {l,r} x {l,u}; side and uplo are not passed.
FLA_Error FLA_Symm_ll_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_lu_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_rl_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_ru_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );

// Syr2k: suffixes {l,u} x {n,t}; uplo and trans are not passed.
FLA_Error FLA_Syr2k_ln_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_lt_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_un_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_ut_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );

// Syrk: suffixes {l,u} x {n,t}; uplo and trans are not passed.
FLA_Error FLA_Syrk_ln_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );
FLA_Error FLA_Syrk_lt_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );
FLA_Error FLA_Syrk_un_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );
FLA_Error FLA_Syrk_ut_task( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );

// Sixteen Trmm task variants with three-letter suffixes drawn from
// {l,r} x {l,u} x {c,h,n,t}. They take FLA_Trmm_task's arguments without
// side, uplo and trans; diag is still passed explicitly.
// NOTE(review): the suffix presumably encodes side/uplo/trans in that order --
// confirm against the implementations.
FLA_Error FLA_Trmm_llc_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_llh_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_lln_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_llt_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_luc_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_luh_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_lun_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_lut_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_rlc_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_rlh_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_rln_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_rlt_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_ruc_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_ruh_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_run_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_rut_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );

// Sixteen Trsm task variants with three-letter suffixes drawn from
// {l,r} x {l,u} x {c,h,n,t}. They take FLA_Trsm_task's arguments without
// side, uplo and trans; diag is still passed explicitly.
// NOTE(review): the suffix presumably encodes side/uplo/trans in that order --
// confirm against the implementations.
FLA_Error FLA_Trsm_llc_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_llh_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_lln_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_llt_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_luc_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_luh_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_lun_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_lut_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_rlc_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_rlh_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_rln_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_rlt_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_ruc_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_ruh_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_run_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_rut_task( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );


// --- external wrapper prototypes ---------------------------------------------

// *_external counterparts of the top-level BLAS-3 wrappers. Each parameter
// list is identical to that of the like-named wrapper without the suffix.
// No *_external prototype is declared here for Gemp, Gepm or Gepp.
FLA_Error FLA_Gemm_external( FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_external( FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Herk_external( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_external( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_external( FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syrk_external( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_external( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Trmm_external( FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_external( FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// The *sx forms additionally take beta and a separate object C.
FLA_Error FLA_Trmmsx_external( FLA_Side side, FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Trsmsx_external( FLA_Side side, FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );


// --- gpu wrapper prototypes --------------------------------------------------

// Same arguments as the corresponding *_external wrappers, except that each of
// the operands A, B and C is immediately followed by a void* companion
// parameter (A_gpu, B_gpu, C_gpu). alpha and beta have no companion. No *_gpu
// prototype is declared here for Trmmsx or Trsmsx.
// NOTE(review): the *_gpu pointers are presumably the device-side buffers for
// the preceding object -- confirm against the implementations.
FLA_Error FLA_Gemm_external_gpu( FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu, FLA_Obj beta, FLA_Obj C, void* C_gpu );
FLA_Error FLA_Hemm_external_gpu( FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu, FLA_Obj beta, FLA_Obj C, void* C_gpu );
FLA_Error FLA_Herk_external_gpu( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj beta, FLA_Obj C, void* C_gpu );
FLA_Error FLA_Her2k_external_gpu( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu, FLA_Obj beta, FLA_Obj C, void* C_gpu );
FLA_Error FLA_Symm_external_gpu( FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu, FLA_Obj beta, FLA_Obj C, void* C_gpu );
FLA_Error FLA_Syrk_external_gpu( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj beta, FLA_Obj C, void* C_gpu );
FLA_Error FLA_Syr2k_external_gpu( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu, FLA_Obj beta, FLA_Obj C, void* C_gpu );
FLA_Error FLA_Trmm_external_gpu( FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu );
FLA_Error FLA_Trsm_external_gpu( FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu );


// --- hip wrapper prototypes --------------------------------------------------
// Declared only when FLA_ENABLE_HIP is defined. Each parameter list equals
// that of the corresponding *_external_gpu prototype with a leading
// rocblas_handle argument.
#ifdef FLA_ENABLE_HIP
FLA_Error FLA_Gemm_external_hip( rocblas_handle handle, FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu, FLA_Obj beta, FLA_Obj C, void* C_gpu );
FLA_Error FLA_Hemm_external_hip( rocblas_handle handle, FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu, FLA_Obj beta, FLA_Obj C, void* C_gpu );
FLA_Error FLA_Herk_external_hip( rocblas_handle handle, FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj beta, FLA_Obj C, void* C_gpu );
FLA_Error FLA_Her2k_external_hip( rocblas_handle handle, FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu, FLA_Obj beta, FLA_Obj C, void* C_gpu );
FLA_Error FLA_Symm_external_hip( rocblas_handle handle, FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu, FLA_Obj beta, FLA_Obj C, void* C_gpu );
FLA_Error FLA_Syrk_external_hip( rocblas_handle handle, FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj beta, FLA_Obj C, void* C_gpu );
FLA_Error FLA_Syr2k_external_hip( rocblas_handle handle, FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu, FLA_Obj beta, FLA_Obj C, void* C_gpu );
FLA_Error FLA_Trmm_external_hip( rocblas_handle handle, FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu );
FLA_Error FLA_Trsm_external_hip( rocblas_handle handle, FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, void* A_gpu, FLA_Obj B, void* B_gpu );
#endif

// --- check routine prototypes ------------------------------------------------

// front-ends
// One *_check prototype per top-level BLAS-3 wrapper (none for Gemp, Gepm or
// Gepp), with the same parameter types in the same order. Two cosmetic
// differences from the wrappers: Her2k/Syr2k are listed before Herk/Syrk, and
// the FLA_Trans parameter of FLA_Trmm_check / FLA_Trsm_check is named `transa`
// where FLA_Trmm / FLA_Trsm name it `trans`.
FLA_Error FLA_Gemm_check( FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj  beta, FLA_Obj C );
FLA_Error FLA_Hemm_check( FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta,  FLA_Obj C );
FLA_Error FLA_Her2k_check( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta,  FLA_Obj C );
FLA_Error FLA_Herk_check( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta,  FLA_Obj C );
FLA_Error FLA_Symm_check( FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_check( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta,  FLA_Obj C );
FLA_Error FLA_Syrk_check( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta,  FLA_Obj C );
FLA_Error FLA_Trmm_check( FLA_Side side, FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmmsx_check( FLA_Side side, FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Trsm_check( FLA_Side side, FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsmsx_check( FLA_Side side, FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// internal back-ends
// One *_internal_check prototype per generic *_task wrapper, with the same
// argument list including the trailing `cntl` pointer.
FLA_Error FLA_Gemm_internal_check( FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Hemm_internal_check( FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Herk_internal_check( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );
FLA_Error FLA_Her2k_internal_check( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Symm_internal_check( FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Syrk_internal_check( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );
FLA_Error FLA_Syr2k_internal_check( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Trmm_internal_check( FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trsm_internal_check( FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );

// end FLA_blas3_prototypes.h
// begin FLA_lapack_prototypes.h


// --- top-level wrapper prototypes --------------------------------------------

// Top-level LAPACK-level wrappers. All return FLA_Error and take their
// operands as FLA_Obj values passed by value; none takes a `cntl` pointer.
// NOTE(review): which objects are inputs and which are overwritten cannot be
// read off these prototypes -- consult the implementations.
FLA_Error FLA_Chol( FLA_Uplo uplo, FLA_Obj A );
FLA_Error FLA_LU_nopiv( FLA_Obj A );
FLA_Error FLA_LU_piv( FLA_Obj A, FLA_Obj p );
FLA_Error FLA_QR_UT( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_QR_UT_piv( FLA_Obj A, FLA_Obj T, FLA_Obj w, FLA_Obj p );
FLA_Error FLA_LQ_UT( FLA_Obj A, FLA_Obj S );
FLA_Error FLA_Trinv( FLA_Uplo uplo, FLA_Diag diag, FLA_Obj A );
FLA_Error FLA_Ttmm( FLA_Uplo uplo, FLA_Obj A );
FLA_Error FLA_Sylv( FLA_Trans transa, FLA_Trans transb, FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_SPDinv( FLA_Uplo uplo, FLA_Obj A );
FLA_Error FLA_Hess_UT( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Eig_gest( FLA_Inv inv, FLA_Uplo uplo, FLA_Obj A, FLA_Obj B );

// Accum_T / Apply_* routines; same calling convention as the wrappers above.
FLA_Error FLA_Accum_T_UT( FLA_Direct direct, FLA_Store storev, FLA_Obj A, FLA_Obj tau, FLA_Obj T );
FLA_Error FLA_Apply_H2_UT( FLA_Side side, FLA_Obj tau, FLA_Obj u2, FLA_Obj a1, FLA_Obj A2 );
FLA_Error FLA_Apply_HUD_UT( FLA_Side side, FLA_Obj tau, FLA_Obj w12t, FLA_Obj u2, FLA_Obj v2, FLA_Obj r12t, FLA_Obj C2, FLA_Obj D2 );
FLA_Error FLA_Apply_Q_UT( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B );
FLA_Error FLA_Apply_pivots( FLA_Side side, FLA_Trans trans, FLA_Obj p, FLA_Obj A );

// --- task wrapper prototypes -------------------------------------------------

// Chol task wrappers: FLA_Chol's arguments plus a trailing fla_chol_t* cntl.
// The _l/_u variants omit the uplo argument.
FLA_Error FLA_Chol_task( FLA_Uplo uplo, FLA_Obj A, fla_chol_t* cntl );
FLA_Error FLA_Chol_l_task( FLA_Obj A, fla_chol_t* cntl );
FLA_Error FLA_Chol_u_task( FLA_Obj A, fla_chol_t* cntl );
// LU, pivot-application and SA_* task wrappers. Every prototype ends with a
// `cntl` pointer, but the control type varies: fla_lu_t* for the LU and SA_LU
// routines, fla_appiv_t* for Apply_pivots, fla_trsm_t* for FLA_Trsm_piv_task
// and fla_gemm_t* for FLA_SA_FS_task. The two SA_* routines also take a
// dim_t nb_alg argument immediately before cntl.
FLA_Error FLA_LU_piv_macro_task( FLA_Obj A, FLA_Obj p, fla_lu_t* cntl );
FLA_Error FLA_Apply_pivots_task( FLA_Side side, FLA_Trans trans, FLA_Obj p, FLA_Obj A, fla_appiv_t* cntl );
FLA_Error FLA_Apply_pivots_ln_task( FLA_Obj p, FLA_Obj A, fla_appiv_t* cntl );
FLA_Error FLA_Apply_pivots_macro_task( FLA_Side side, FLA_Trans trans, FLA_Obj p, FLA_Obj A, fla_appiv_t* cntl );
FLA_Error FLA_LU_nopiv_task( FLA_Obj A, fla_lu_t* cntl );
FLA_Error FLA_LU_piv_task( FLA_Obj A, FLA_Obj p, fla_lu_t* cntl );
FLA_Error FLA_LU_piv_copy_task( FLA_Obj A, FLA_Obj p, FLA_Obj U, fla_lu_t* cntl );
FLA_Error FLA_Trsm_piv_task( FLA_Obj A, FLA_Obj B, FLA_Obj p, fla_trsm_t* cntl );
FLA_Error FLA_SA_LU_task( FLA_Obj U, FLA_Obj D, FLA_Obj p, FLA_Obj L, dim_t nb_alg, fla_lu_t* cntl );
FLA_Error FLA_SA_FS_task( FLA_Obj L, FLA_Obj D, FLA_Obj p, FLA_Obj C, FLA_Obj E, dim_t nb_alg, fla_gemm_t* cntl );
// Trinv and Ttmm task wrappers: the top-level arguments plus a trailing cntl
// pointer (fla_trinv_t* / fla_ttmm_t*). The suffixed variants take only A and
// cntl: Trinv_{l,u}{n,u} omit uplo and diag, Ttmm_{l,u} omit uplo.
FLA_Error FLA_Trinv_task( FLA_Uplo uplo, FLA_Diag diag, FLA_Obj A, fla_trinv_t* cntl );
FLA_Error FLA_Trinv_ln_task( FLA_Obj A, fla_trinv_t* cntl );
FLA_Error FLA_Trinv_lu_task( FLA_Obj A, fla_trinv_t* cntl );
FLA_Error FLA_Trinv_un_task( FLA_Obj A, fla_trinv_t* cntl );
FLA_Error FLA_Trinv_uu_task( FLA_Obj A, fla_trinv_t* cntl );
FLA_Error FLA_Ttmm_task( FLA_Uplo uplo, FLA_Obj A, fla_ttmm_t* cntl );
FLA_Error FLA_Ttmm_l_task( FLA_Obj A, fla_ttmm_t* cntl );
FLA_Error FLA_Ttmm_u_task( FLA_Obj A, fla_ttmm_t* cntl );
// Sylv and Lyap task wrappers. FLA_Sylv_task is FLA_Sylv's argument list plus
// a trailing fla_sylv_t* cntl; the _nn/_nh/_hn/_hh variants omit transa and
// transb. FLA_Lyap_task takes a single trans selector and a trailing
// fla_lyap_t* cntl; the _n/_h variants omit trans.
FLA_Error FLA_Sylv_task( FLA_Trans transa, FLA_Trans transb, FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nn_task( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nh_task( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hn_task( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hh_task( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Lyap_task( FLA_Trans trans, FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj scale, fla_lyap_t* cntl );
FLA_Error FLA_Lyap_n_task( FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj scale, fla_lyap_t* cntl );
FLA_Error FLA_Lyap_h_task( FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj scale, fla_lyap_t* cntl );
// Apply_Q_UT task wrappers. The generic form is FLA_Apply_Q_UT's argument
// list plus a trailing fla_apqut_t* cntl. The sixteen four-letter variants
// ({l,r}{h,n}{b,f}{c,r}) omit side, trans, direct and storev.
// NOTE(review): the suffix letters presumably encode those four selectors in
// that order -- confirm against the implementations.
FLA_Error FLA_Apply_Q_UT_task( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lhbc_task( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lhbr_task( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lhfc_task( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lhfr_task( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lnbc_task( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lnbr_task( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lnfc_task( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lnfr_task( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rhbc_task( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rhbr_task( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rhfc_task( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rhfr_task( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rnbc_task( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rnbr_task( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rnfc_task( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rnfr_task( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
// Apply_Q2_UT and Apply_CAQ2_UT: a generic form with all four selectors and a
// single suffixed variant (_lhfc) each; control types are fla_apq2ut_t* and
// fla_apcaq2ut_t* respectively.
FLA_Error FLA_Apply_Q2_UT_task( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C, FLA_Obj E, fla_apq2ut_t* cntl );
FLA_Error FLA_Apply_Q2_UT_lhfc_task( FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C, FLA_Obj E, fla_apq2ut_t* cntl );
FLA_Error FLA_Apply_CAQ2_UT_task( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C, FLA_Obj E, fla_apcaq2ut_t* cntl );
FLA_Error FLA_Apply_CAQ2_UT_lhfc_task( FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C, FLA_Obj E, fla_apcaq2ut_t* cntl );
// QR / LQ / UDdate / Apply_QUD task wrappers. Each ends with a `cntl` pointer
// whose type is specific to the routine family (fla_qr2ut_t*, fla_caqr2ut_t*,
// fla_qrut_t*, fla_lqut_t*, fla_uddateut_t*, fla_apqudut_t*).
FLA_Error FLA_QR2_UT_task( FLA_Obj B, FLA_Obj D, FLA_Obj T, fla_qr2ut_t* cntl );
FLA_Error FLA_CAQR2_UT_task( FLA_Obj B, FLA_Obj D, FLA_Obj T, fla_caqr2ut_t* cntl );
FLA_Error FLA_QR_UT_macro_task( FLA_Obj A, FLA_Obj T, fla_qrut_t* cntl );
FLA_Error FLA_QR_UT_task( FLA_Obj A, FLA_Obj T, fla_qrut_t* cntl );
// Takes an additional object U compared with FLA_QR_UT_task.
FLA_Error FLA_QR_UT_copy_task( FLA_Obj A, FLA_Obj T, FLA_Obj U, fla_qrut_t* cntl );
FLA_Error FLA_LQ_UT_macro_task( FLA_Obj A, FLA_Obj T, fla_lqut_t* cntl );
FLA_Error FLA_LQ_UT_task( FLA_Obj A, FLA_Obj T, fla_lqut_t* cntl );
FLA_Error FLA_UDdate_UT_task( FLA_Obj R, FLA_Obj C, FLA_Obj D, FLA_Obj T, fla_uddateut_t* cntl );
// Generic Apply_QUD_UT form with four selectors, then its single suffixed
// variant (_lhfc), which omits them.
FLA_Error FLA_Apply_QUD_UT_task( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj T, FLA_Obj W, FLA_Obj R, FLA_Obj U, FLA_Obj C, FLA_Obj V, FLA_Obj D, fla_apqudut_t* cntl );
FLA_Error FLA_Apply_QUD_UT_lhfc_task( FLA_Obj T, FLA_Obj W, FLA_Obj R, FLA_Obj U, FLA_Obj C, FLA_Obj V, FLA_Obj D, fla_apqudut_t* cntl );
// Eig_gest task wrappers. Unlike the top-level FLA_Eig_gest( inv, uplo, A, B ),
// the task form takes an extra object Y between A and B, plus the trailing
// fla_eig_gest_t* cntl. The _il/_iu/_nl/_nu variants omit inv and uplo.
FLA_Error FLA_Eig_gest_task( FLA_Inv inv, FLA_Uplo uplo, FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );
FLA_Error FLA_Eig_gest_il_task( FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );
FLA_Error FLA_Eig_gest_iu_task( FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );
FLA_Error FLA_Eig_gest_nl_task( FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );
FLA_Error FLA_Eig_gest_nu_task( FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );

// --- external wrapper prototypes ---------------------------------------------

// FLA_Apply_Q / FLA_Apply_pivots external wrappers. None of these take a
// control-structure argument. The _ln_ variant takes only ( p, A ), i.e. it
// drops the side/trans arguments of FLA_Apply_pivots_unb_external.
// NOTE(review): "_ln_" presumably fixes side and trans -- confirm.
FLA_Error FLA_Apply_Q_blk_external( FLA_Side side, FLA_Trans trans, FLA_Store storev, FLA_Obj A, FLA_Obj t, FLA_Obj B );

FLA_Error FLA_Apply_pivots_unb_external( FLA_Side side, FLA_Trans trans, FLA_Obj p, FLA_Obj A );
FLA_Error FLA_Apply_pivots_ln_unb_ext( FLA_Obj p, FLA_Obj A );

FLA_Error FLA_Apply_pivots_macro_external( FLA_Side side, FLA_Trans trans, FLA_Obj p, FLA_Obj A );

// FLA_Chol_* and FLA_LU_piv_* external wrappers, each in a _blk_ and an _unb_
// flavour (presumably blocked / unblocked -- confirm). For Chol, the
// *_external form takes the FLA_Uplo selector while the _l_/_u_ *_ext forms
// take only A. For LU_piv, the *_external and *_ext forms share the same
// ( A, p ) signature.
FLA_Error FLA_Chol_blk_external( FLA_Uplo uplo, FLA_Obj A );
FLA_Error FLA_Chol_l_blk_ext( FLA_Obj A );
FLA_Error FLA_Chol_u_blk_ext( FLA_Obj A );
FLA_Error FLA_Chol_unb_external( FLA_Uplo uplo, FLA_Obj A );
FLA_Error FLA_Chol_l_unb_ext( FLA_Obj A );
FLA_Error FLA_Chol_u_unb_ext( FLA_Obj A );

FLA_Error FLA_LU_piv_blk_external( FLA_Obj A, FLA_Obj p );
FLA_Error FLA_LU_piv_blk_ext( FLA_Obj A, FLA_Obj p );
FLA_Error FLA_LU_piv_unb_external( FLA_Obj A, FLA_Obj p );
FLA_Error FLA_LU_piv_unb_ext( FLA_Obj A, FLA_Obj p );

// External wrappers for the QR, LQ, Hess, Tridiag and Bidiag families.
// The first five pairs are the _blk_/_unb_ routines themselves; they take the
// matrix A plus one (t) or two (tu, tv) additional FLA_Obj operands. The Hess
// routines additionally take the integer range ilo/ihi.
// NOTE(review): t/tu/tv are presumably the Householder scalar vectors as in
// LAPACK -- confirm.
FLA_Error FLA_QR_blk_external( FLA_Obj A, FLA_Obj t );
FLA_Error FLA_QR_unb_external( FLA_Obj A, FLA_Obj t );

FLA_Error FLA_LQ_blk_external( FLA_Obj A, FLA_Obj t );
FLA_Error FLA_LQ_unb_external( FLA_Obj A, FLA_Obj t );

FLA_Error FLA_Hess_blk_external( FLA_Obj A, FLA_Obj t, int ilo, int ihi );
FLA_Error FLA_Hess_unb_external( FLA_Obj A, FLA_Obj t, int ilo, int ihi );

FLA_Error FLA_Tridiag_blk_external( FLA_Uplo uplo, FLA_Obj A, FLA_Obj t );
FLA_Error FLA_Tridiag_unb_external( FLA_Uplo uplo, FLA_Obj A, FLA_Obj t );

FLA_Error FLA_Bidiag_blk_external( FLA_Obj A, FLA_Obj tu, FLA_Obj tv );
FLA_Error FLA_Bidiag_unb_external( FLA_Obj A, FLA_Obj tu, FLA_Obj tv );

// *_form_* routines take only ( A, t ) (plus uplo for Tridiag); *_apply_*
// routines additionally take side/trans selectors and a second matrix B.
FLA_Error FLA_QR_form_Q_external( FLA_Obj A, FLA_Obj t );

FLA_Error FLA_Tridiag_form_Q_external( FLA_Uplo uplo, FLA_Obj A, FLA_Obj t );
FLA_Error FLA_Tridiag_apply_Q_external( FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Obj A, FLA_Obj t, FLA_Obj B );

FLA_Error FLA_Bidiag_form_U_external( FLA_Obj A, FLA_Obj t );
FLA_Error FLA_Bidiag_form_V_external( FLA_Obj A, FLA_Obj t );
FLA_Error FLA_Bidiag_apply_U_external( FLA_Side side, FLA_Trans trans, FLA_Obj A, FLA_Obj t, FLA_Obj B );
FLA_Error FLA_Bidiag_apply_V_external( FLA_Side side, FLA_Trans trans, FLA_Obj A, FLA_Obj t, FLA_Obj B );

// FLA_Trinv_* and FLA_Ttmm_* external wrappers. All operate on a single
// object A. The *_external forms take the enum selectors (uplo, and diag for
// Trinv); the two-letter (_ln_, _lu_, _un_, _uu_) and one-letter (_l_, _u_)
// *_ext forms take A only.
// NOTE(review): the letters presumably encode uplo (l/u) followed by diag
// (n/u) -- confirm.
FLA_Error FLA_Trinv_blk_external( FLA_Uplo uplo, FLA_Diag diag, FLA_Obj A );
FLA_Error FLA_Trinv_ln_blk_ext( FLA_Obj A );
FLA_Error FLA_Trinv_lu_blk_ext( FLA_Obj A );
FLA_Error FLA_Trinv_un_blk_ext( FLA_Obj A );
FLA_Error FLA_Trinv_uu_blk_ext( FLA_Obj A );
FLA_Error FLA_Trinv_unb_external( FLA_Uplo uplo, FLA_Diag diag, FLA_Obj A );
FLA_Error FLA_Trinv_ln_unb_ext( FLA_Obj A );
FLA_Error FLA_Trinv_lu_unb_ext( FLA_Obj A );
FLA_Error FLA_Trinv_un_unb_ext( FLA_Obj A );
FLA_Error FLA_Trinv_uu_unb_ext( FLA_Obj A );

FLA_Error FLA_Ttmm_blk_external( FLA_Uplo uplo, FLA_Obj A );
FLA_Error FLA_Ttmm_l_blk_ext( FLA_Obj A );
FLA_Error FLA_Ttmm_u_blk_ext( FLA_Obj A );
FLA_Error FLA_Ttmm_unb_external( FLA_Uplo uplo, FLA_Obj A );
FLA_Error FLA_Ttmm_l_unb_ext( FLA_Obj A );
FLA_Error FLA_Ttmm_u_unb_ext( FLA_Obj A );

// FLA_Sylv_* external wrappers plus FLA_SPDinv_blk_external. The Sylv
// *_external forms take two FLA_Trans selectors (transa, transb); the
// _nn_/_nh_/_hn_/_hh_ *_ext forms take the same five FLA_Obj operands
// ( isgn, A, B, C, scale ) without the selectors.
// NOTE(review): n/h presumably stand for no-transpose / conjugate-transpose
// of A and B respectively -- confirm.
FLA_Error FLA_Sylv_blk_external( FLA_Trans transa, FLA_Trans transb, FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nn_blk_ext( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nh_blk_ext( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hn_blk_ext( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hh_blk_ext( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_unb_external( FLA_Trans transa, FLA_Trans transb, FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nn_unb_ext( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nh_unb_ext( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hn_unb_ext( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hh_unb_ext( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );

FLA_Error FLA_SPDinv_blk_external( FLA_Uplo uplo, FLA_Obj A );

// FLA_Eig_gest_* external wrappers. The *_external forms take the FLA_Inv and
// FLA_Uplo selectors; the _il_/_iu_/_nl_/_nu_ *_ext forms take ( A, B ) only.
// Unlike the *_task prototypes of the same family, none of these takes the Y
// operand or a control structure.
// NOTE(review): i/n presumably encode inv vs. no-inv and l/u the uplo value
// -- confirm.
FLA_Error FLA_Eig_gest_blk_external( FLA_Inv inv, FLA_Uplo uplo, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Eig_gest_il_blk_ext( FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Eig_gest_iu_blk_ext( FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Eig_gest_nl_blk_ext( FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Eig_gest_nu_blk_ext( FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Eig_gest_unb_external( FLA_Inv inv, FLA_Uplo uplo, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Eig_gest_il_unb_ext( FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Eig_gest_iu_unb_ext( FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Eig_gest_nl_unb_ext( FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Eig_gest_nu_unb_ext( FLA_Obj A, FLA_Obj B );

// External wrappers for the Tevd/Hevd/Bsvd/Svd families. The Tevd* and Hevd*
// routines take an FLA_Evd_type job selector; FLA_Svd_external takes two
// FLA_Svd_type selectors (jobu, jobv) and FLA_Svdd_external a single one
// (jobz). The Bsvd* routines take an FLA_Uplo selector instead of a job type.
// NOTE(review): the trailing d / r in the names presumably select a different
// underlying algorithm (as with the LAPACK *d / *r drivers) -- confirm.
FLA_Error FLA_Tevd_external( FLA_Evd_type jobz, FLA_Obj d, FLA_Obj e, FLA_Obj A );
FLA_Error FLA_Tevdd_external( FLA_Evd_type jobz, FLA_Obj d, FLA_Obj e, FLA_Obj A );
FLA_Error FLA_Tevdr_external( FLA_Evd_type jobz, FLA_Obj d, FLA_Obj e, FLA_Obj l, FLA_Obj A );
FLA_Error FLA_Hevd_external( FLA_Evd_type jobz, FLA_Uplo uplo, FLA_Obj A, FLA_Obj l );
FLA_Error FLA_Hevdd_external( FLA_Evd_type jobz, FLA_Uplo uplo, FLA_Obj A, FLA_Obj l );
FLA_Error FLA_Hevdr_external( FLA_Evd_type jobz, FLA_Uplo uplo, FLA_Obj A, FLA_Obj l, FLA_Obj Z );
FLA_Error FLA_Bsvd_external( FLA_Uplo uplo, FLA_Obj d, FLA_Obj e, FLA_Obj U, FLA_Obj V );
FLA_Error FLA_Bsvdd_external( FLA_Uplo uplo, FLA_Obj d, FLA_Obj e, FLA_Obj U, FLA_Obj V );
FLA_Error FLA_Svd_external( FLA_Svd_type jobu, FLA_Svd_type jobv, FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V );
FLA_Error FLA_Svdd_external( FLA_Svd_type jobz, FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V );

// --- external HIP prototypes -------------------------------------------------
#ifdef FLA_ENABLE_HIP
// HIP variants of the external wrappers (compiled only when FLA_ENABLE_HIP is
// defined). Each takes a rocblas_handle as its first argument, and most
// FLA_Obj operands are immediately followed by a void* named <operand>_hip.
// The operand p (and L in FLA_SA_FS_blk_hip) has no such companion pointer.
// NOTE(review): the *_hip pointers presumably reference the device-side copy
// of the preceding object's buffer -- confirm against the implementations.
FLA_Error FLA_Apply_pivots_unb_external_hip( rocblas_handle handle, FLA_Side side, FLA_Trans trans, FLA_Obj p, FLA_Obj A, void* A_hip );
FLA_Error FLA_Apply_pivots_ln_unb_ext_hip( rocblas_handle handle, FLA_Obj p, FLA_Obj A, void* A_hip );
FLA_Error FLA_Apply_Q_blk_external_hip( rocblas_handle handle, FLA_Side side, FLA_Trans trans, FLA_Store storev, FLA_Obj A, void* A_hip, FLA_Obj t, void* t_hip, FLA_Obj B, void* B_hip );
FLA_Error FLA_Bidiag_apply_U_external_hip( rocblas_handle handle, FLA_Side side, FLA_Trans trans, FLA_Obj A, void* A_hip, FLA_Obj t, void* t_hip, FLA_Obj B, void* B_hip );
FLA_Error FLA_Bidiag_apply_V_external_hip( rocblas_handle handle, FLA_Side side, FLA_Trans trans, FLA_Obj A, void* A_hip, FLA_Obj t, void* t_hip, FLA_Obj B, void* B_hip );
FLA_Error FLA_Bidiag_blk_external_hip( rocblas_handle handle, FLA_Obj A, void* A_hip, FLA_Obj tu, void* tu_hip, FLA_Obj tv, void* tv_hip );
FLA_Error FLA_Bidiag_blk_ext_hip( rocblas_handle handle, FLA_Obj A, void* A_hip, FLA_Obj tu, void* tu_hip, FLA_Obj tv, void* tv_hip );
FLA_Error FLA_Bidiag_unb_external_hip( rocblas_handle handle, FLA_Obj A, void* A_hip, FLA_Obj tu, void* tu_hip, FLA_Obj tv, void* tv_hip );
FLA_Error FLA_Bidiag_unb_ext_hip( rocblas_handle handle, FLA_Obj A, void* A_hip, FLA_Obj tu, void* tu_hip, FLA_Obj tv, void* tv_hip );
FLA_Error FLA_Bidiag_form_U_external_hip( rocblas_handle handle, FLA_Obj A, void* A_hip, FLA_Obj t, void* t_hip );
FLA_Error FLA_Bidiag_form_V_external_hip( rocblas_handle handle, FLA_Obj A, void* A_hip, FLA_Obj t, void* t_hip );
FLA_Error FLA_Bsvd_external_hip( rocblas_handle handle, FLA_Uplo uplo, FLA_Obj d, void* d_hip, FLA_Obj e, void* e_hip, FLA_Obj U, void* U_hip, FLA_Obj V, void* V_hip );
FLA_Error FLA_Chol_blk_external_hip( rocblas_handle handle, FLA_Uplo uplo, FLA_Obj A, void* A_hip );
FLA_Error FLA_Chol_l_blk_ext_hip( rocblas_handle handle, FLA_Obj A, void* A_hip );
FLA_Error FLA_Chol_u_blk_ext_hip( rocblas_handle handle, FLA_Obj A, void* A_hip );
FLA_Error FLA_Chol_unb_external_hip( rocblas_handle handle, FLA_Uplo uplo, FLA_Obj A, void* A_hip );
FLA_Error FLA_Chol_l_unb_ext_hip( rocblas_handle handle, FLA_Obj A, void* A_hip );
FLA_Error FLA_Chol_u_unb_ext_hip( rocblas_handle handle, FLA_Obj A, void* A_hip );
FLA_Error FLA_Eig_gest_blk_external_hip( rocblas_handle handle, FLA_Inv inv, FLA_Uplo uplo, FLA_Obj A, void* A_hip, FLA_Obj B, void* B_hip );
FLA_Error FLA_Eig_gest_il_blk_ext_hip( rocblas_handle handle, FLA_Obj A, void* A_hip, FLA_Obj B, void* B_hip );
FLA_Error FLA_Eig_gest_iu_blk_ext_hip( rocblas_handle handle, FLA_Obj A, void* A_hip, FLA_Obj B, void* B_hip );
FLA_Error FLA_Eig_gest_nl_blk_ext_hip( rocblas_handle handle, FLA_Obj A, void* A_hip, FLA_Obj B, void* B_hip );
FLA_Error FLA_Eig_gest_nu_blk_ext_hip( rocblas_handle handle, FLA_Obj A, void* A_hip, FLA_Obj B, void* B_hip );
FLA_Error FLA_Eig_gest_unb_external_hip( rocblas_handle handle, FLA_Inv inv, FLA_Uplo uplo, FLA_Obj A, void* A_hip, FLA_Obj B, void* B_hip );
FLA_Error FLA_Eig_gest_il_unb_ext_hip( rocblas_handle handle, FLA_Obj A, void* A_hip, FLA_Obj B, void* B_hip );
FLA_Error FLA_Eig_gest_iu_unb_ext_hip( rocblas_handle handle, FLA_Obj A, void* A_hip, FLA_Obj B, void* B_hip );
FLA_Error FLA_Eig_gest_nl_unb_ext_hip( rocblas_handle handle, FLA_Obj A, void* A_hip, FLA_Obj B, void* B_hip );
FLA_Error FLA_Eig_gest_nu_unb_ext_hip( rocblas_handle handle, FLA_Obj A, void* A_hip, FLA_Obj B, void* B_hip );
FLA_Error FLA_Hevdd_external_hip( rocblas_handle handle, FLA_Evd_type jobz, FLA_Uplo uplo, FLA_Obj A, void* A_hip, FLA_Obj e, void* e_hip );
FLA_Error FLA_Hevd_external_hip( rocblas_handle handle, FLA_Evd_type jobz, FLA_Uplo uplo, FLA_Obj A, void* A_hip, FLA_Obj e, void* e_hip );
FLA_Error FLA_LU_piv_blk_external_hip( rocblas_handle handle, FLA_Obj A, void* A_hip, FLA_Obj p );
FLA_Error FLA_LU_piv_copy_external_hip( rocblas_handle handle, FLA_Obj A, void* A_hip, FLA_Obj p, FLA_Obj U, void* U_hip );
FLA_Error FLA_LQ_blk_external_hip( rocblas_handle handle, FLA_Obj A, void* A_hip, FLA_Obj t, void* t_hip );
FLA_Error FLA_LQ_blk_ext_hip( rocblas_handle handle, FLA_Obj A, void* A_hip, FLA_Obj t, void* t_hip );
FLA_Error FLA_LQ_unb_external_hip( rocblas_handle handle, FLA_Obj A, void* A_hip, FLA_Obj t, void* t_hip );
FLA_Error FLA_LQ_unb_ext_hip( rocblas_handle handle, FLA_Obj A, void* A_hip, FLA_Obj t, void* t_hip );
FLA_Error FLA_QR_form_Q_external_hip( rocblas_handle handle, FLA_Obj A, void* A_hip, FLA_Obj t, void* t_hip );
FLA_Error FLA_QR_unb_external_hip( rocblas_handle handle, FLA_Obj A, void* A_hip, FLA_Obj t, void* t_hip );
FLA_Error FLA_QR_unb_ext_hip( rocblas_handle handle, FLA_Obj A, void* A_hip, FLA_Obj t, void* t_hip );
FLA_Error FLA_SA_Apply_pivots_hip( rocblas_handle handle, FLA_Obj C, void* C_hip, FLA_Obj E, void* E_hip, FLA_Obj p );
FLA_Error FLA_SA_FS_blk_hip( rocblas_handle handle, FLA_Obj L, FLA_Obj D, void* D_hip, FLA_Obj p, FLA_Obj C, void* C_hip,FLA_Obj E, void* E_hip, dim_t nb_alg );
FLA_Error FLA_Svd_external_hip( rocblas_handle handle, FLA_Svd_type jobu, FLA_Svd_type jobv, FLA_Obj A, void* A_hip, FLA_Obj s, void* s_hip, FLA_Obj U, void* U_hip, FLA_Obj V, void* V_hip );
FLA_Error FLA_Tevdd_external_hip( rocblas_handle handle, FLA_Evd_type jobz, FLA_Obj d, void* d_hip, FLA_Obj e, void* e_hip, FLA_Obj A, void* A_hip );
FLA_Error FLA_Tevd_external_hip( rocblas_handle handle, FLA_Evd_type jobz, FLA_Obj d, void* d_hip, FLA_Obj e, void* e_hip, FLA_Obj A, void* A_hip );
FLA_Error FLA_Tridiag_apply_Q_external_hip( rocblas_handle handle, FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Obj A, void* A_hip, FLA_Obj t, void* t_hip, FLA_Obj B, void* B_hip );
// FLA_Tridiag_blk_* HIP prototypes. The pointer that follows operand A is
// named A_hip, matching the <operand>_hip spelling used by the other *_hip
// prototypes in this group (parameter names in a prototype do not affect
// callers).
FLA_Error FLA_Tridiag_blk_external_hip( rocblas_handle handle, FLA_Uplo uplo, FLA_Obj A, void* A_hip, FLA_Obj t, void* t_hip );
FLA_Error FLA_Tridiag_blk_ext_hip( rocblas_handle handle, FLA_Uplo uplo, FLA_Obj A, void* A_hip, FLA_Obj t, void* t_hip );
// Remaining HIP prototypes (Tridiag form_Q/unb, Trinv, Trsm_piv). Same shape
// as the rest of the HIP group: rocblas_handle first, and a void* *_hip
// pointer after each FLA_Obj operand except p in FLA_Trsm_piv_external_hip.
FLA_Error FLA_Tridiag_form_Q_external_hip( rocblas_handle handle, FLA_Uplo uplo, FLA_Obj A, void* A_hip, FLA_Obj t, void* t_hip );
FLA_Error FLA_Tridiag_unb_external_hip( rocblas_handle handle, FLA_Uplo uplo, FLA_Obj A, void* A_hip, FLA_Obj t, void* t_hip );
FLA_Error FLA_Tridiag_unb_ext_hip( rocblas_handle handle, FLA_Uplo uplo, FLA_Obj A, void* A_hip, FLA_Obj t, void* t_hip );
FLA_Error FLA_Trinv_blk_external_hip( rocblas_handle handle, FLA_Uplo uplo, FLA_Diag diag, FLA_Obj A, void* A_hip );
FLA_Error FLA_Trinv_ln_blk_ext_hip( rocblas_handle handle, FLA_Obj A, void* A_hip );
FLA_Error FLA_Trinv_lu_blk_ext_hip( rocblas_handle handle, FLA_Obj A, void* A_hip );
FLA_Error FLA_Trinv_un_blk_ext_hip( rocblas_handle handle, FLA_Obj A, void* A_hip );
FLA_Error FLA_Trinv_uu_blk_ext_hip( rocblas_handle handle, FLA_Obj A, void* A_hip );
FLA_Error FLA_Trsm_piv_external_hip( rocblas_handle handle, FLA_Obj A, void* A_hip, FLA_Obj B, void* B_hip, FLA_Obj p );
#endif

// --- check routine prototypes ------------------------------------------------

// *_check prototypes for the front-end operations. Each returns FLA_Error and
// takes only enum selectors, FLA_Obj operands and (for Hess) the ilo/ihi
// integers; none takes a control structure.
// NOTE(review): these presumably validate the arguments of the operation
// named by the prefix (e.g. FLA_Chol_check for FLA_Chol) -- confirm.
FLA_Error FLA_Chol_check( FLA_Uplo uplo, FLA_Obj A );
FLA_Error FLA_Chol_solve_check( FLA_Uplo uplo, FLA_Obj A, FLA_Obj B, FLA_Obj X );
FLA_Error FLA_LU_nopiv_check( FLA_Obj A );
FLA_Error FLA_LU_nopiv_solve_check( FLA_Obj A, FLA_Obj B, FLA_Obj X );
FLA_Error FLA_LU_piv_check( FLA_Obj A, FLA_Obj p );
FLA_Error FLA_LU_piv_solve_check( FLA_Obj A, FLA_Obj p, FLA_Obj B, FLA_Obj X );
FLA_Error FLA_LU_incpiv_check( FLA_Obj A, FLA_Obj p, FLA_Obj L );
FLA_Error FLA_LU_incpiv_solve_check( FLA_Obj A, FLA_Obj p, FLA_Obj L, FLA_Obj B, FLA_Obj X );
FLA_Error FLA_FS_incpiv_check( FLA_Obj A, FLA_Obj p, FLA_Obj L, FLA_Obj b );
FLA_Error FLA_QR_check( FLA_Obj A, FLA_Obj t );
FLA_Error FLA_QR_UT_check( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_QR_UT_solve_check( FLA_Obj A, FLA_Obj T, FLA_Obj B, FLA_Obj X );
FLA_Error FLA_QR_UT_recover_tau_check( FLA_Obj T, FLA_Obj tau );
FLA_Error FLA_QR_UT_form_Q_check( FLA_Obj A, FLA_Obj T, FLA_Obj Q );
FLA_Error FLA_LQ_check( FLA_Obj A, FLA_Obj t );
FLA_Error FLA_LQ_UT_check( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_LQ_UT_solve_check( FLA_Obj A, FLA_Obj T, FLA_Obj B, FLA_Obj X );
FLA_Error FLA_LQ_UT_recover_tau_check( FLA_Obj T, FLA_Obj tau );
FLA_Error FLA_LQ_UT_form_Q_check( FLA_Obj A, FLA_Obj T, FLA_Obj Q );
FLA_Error FLA_Hess_check( FLA_Obj A, FLA_Obj t, int ilo, int ihi );
FLA_Error FLA_Hess_UT_check( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Hess_UT_recover_tau_check( FLA_Obj T, FLA_Obj tau );
FLA_Error FLA_Tridiag_check( FLA_Uplo uplo, FLA_Obj A, FLA_Obj t );
FLA_Error FLA_Tridiag_UT_check( FLA_Uplo uplo, FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Tridiag_UT_recover_tau_check( FLA_Obj T, FLA_Obj tau );
FLA_Error FLA_Tridiag_UT_scale_diagonals_check( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A );
FLA_Error FLA_Tridiag_UT_extract_diagonals_check( FLA_Uplo uplo, FLA_Obj A, FLA_Obj d, FLA_Obj e );
FLA_Error FLA_Tridiag_UT_extract_real_diagonals_check( FLA_Uplo uplo, FLA_Obj A, FLA_Obj d, FLA_Obj e );
FLA_Error FLA_Tridiag_UT_realify_check( FLA_Uplo uplo, FLA_Obj A, FLA_Obj d );
FLA_Error FLA_Tridiag_UT_realify_subdiagonal_check( FLA_Obj b, FLA_Obj d );
FLA_Error FLA_Tridiag_UT_shift_U_check( FLA_Uplo uplo, FLA_Obj A );
FLA_Error FLA_Tridiag_UT_form_Q_check( FLA_Uplo uplo, FLA_Obj A, FLA_Obj T, FLA_Obj Q );
FLA_Error FLA_Trinv_check( FLA_Uplo uplo, FLA_Diag diag, FLA_Obj A );
FLA_Error FLA_Bidiag_check( FLA_Obj A, FLA_Obj tu, FLA_Obj tv );
FLA_Error FLA_Bidiag_UT_check( FLA_Obj A, FLA_Obj TU, FLA_Obj TV );
FLA_Error FLA_Bidiag_UT_recover_tau_check( FLA_Obj TU, FLA_Obj TV, FLA_Obj tu, FLA_Obj tv );
FLA_Error FLA_Bidiag_UT_extract_diagonals_check( FLA_Obj A, FLA_Obj d, FLA_Obj e );
FLA_Error FLA_Bidiag_UT_extract_real_diagonals_check( FLA_Obj A, FLA_Obj d, FLA_Obj e );
FLA_Error FLA_Bidiag_UT_scale_diagonals_check( FLA_Obj alpha, FLA_Obj A );
FLA_Error FLA_Bidiag_UT_realify_check( FLA_Obj A, FLA_Obj d, FLA_Obj e );
FLA_Error FLA_Bidiag_UT_realify_diagonals_check( FLA_Uplo uplo, FLA_Obj a, FLA_Obj b, FLA_Obj d, FLA_Obj e );
FLA_Error FLA_Bidiag_UT_form_U_check( FLA_Obj A, FLA_Obj T, FLA_Obj U );
FLA_Error FLA_Bidiag_UT_form_V_check( FLA_Obj A, FLA_Obj S, FLA_Obj V );
FLA_Error FLA_Ttmm_check( FLA_Uplo uplo, FLA_Obj A );
FLA_Error FLA_Sylv_check( FLA_Trans transa, FLA_Trans transb, FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Lyap_check( FLA_Trans trans, FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_SPDinv_check( FLA_Uplo uplo, FLA_Obj A );
FLA_Error FLA_Eig_gest_check( FLA_Inv inv, FLA_Uplo uplo, FLA_Obj A, FLA_Obj B );

// *_check prototypes whose argument lists are identical to those of the
// same-named *_external wrappers declared earlier in this header
// (FLA_Apply_Q_blk_external, FLA_QR_form_Q_external,
// FLA_Tridiag_{form,apply}_Q_external, FLA_Bidiag_{form,apply}_{U,V}_external).
FLA_Error FLA_Apply_Q_check( FLA_Side side, FLA_Trans trans, FLA_Store storev, FLA_Obj A, FLA_Obj t, FLA_Obj B );

FLA_Error FLA_QR_form_Q_check( FLA_Obj A, FLA_Obj t );

FLA_Error FLA_Tridiag_form_Q_check( FLA_Uplo uplo, FLA_Obj A, FLA_Obj t );
FLA_Error FLA_Tridiag_apply_Q_check( FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Obj A, FLA_Obj t, FLA_Obj B );

FLA_Error FLA_Bidiag_form_U_check( FLA_Obj A, FLA_Obj t );
FLA_Error FLA_Bidiag_form_V_check( FLA_Obj A, FLA_Obj t );
FLA_Error FLA_Bidiag_apply_U_check( FLA_Side side, FLA_Trans trans, FLA_Obj A, FLA_Obj t, FLA_Obj B );
FLA_Error FLA_Bidiag_apply_V_check( FLA_Side side, FLA_Trans trans, FLA_Obj A, FLA_Obj t, FLA_Obj B );

// *_check prototypes for the Apply_*_UT, QR2/CAQR2 and *_inc routines. The
// Apply_* checks take the four enum selectors (side, trans, direct, storev)
// followed by the FLA_Obj operands; FLA_Apply_pivots_check takes only side
// and trans.
FLA_Error FLA_Apply_Q_UT_check( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B );
FLA_Error FLA_Apply_Q2_UT_check( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C, FLA_Obj E );
FLA_Error FLA_Apply_QUD_UT_check( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj T, FLA_Obj W, FLA_Obj R, FLA_Obj U, FLA_Obj C, FLA_Obj V, FLA_Obj D );
FLA_Error FLA_Apply_pivots_check( FLA_Side side, FLA_Trans trans, FLA_Obj p, FLA_Obj A );
FLA_Error FLA_QR2_UT_check( FLA_Obj B, FLA_Obj D, FLA_Obj T );
FLA_Error FLA_CAQR2_UT_check( FLA_Obj B, FLA_Obj D, FLA_Obj T );
FLA_Error FLA_QR_UT_inc_check( FLA_Obj A, FLA_Obj TW );
FLA_Error FLA_Apply_Q_UT_inc_check( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj A, FLA_Obj TW, FLA_Obj W1, FLA_Obj B );
FLA_Error FLA_Apply_CAQ_UT_inc_check( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj A, FLA_Obj ATW, FLA_Obj R, FLA_Obj RTW, FLA_Obj W1, FLA_Obj B );

// *_check prototypes for the *_inc solve routines, the UDdate_UT family
// (plain and _inc), CAQR_UT_inc, Apply_QUD_UT_inc, and the Apply_H2/HUD and
// Accum_T helpers. The two CAQR checks are the only ones here that take a
// dim_t argument (p) ahead of the FLA_Obj operands.
FLA_Error FLA_QR_UT_inc_solve_check( FLA_Obj A, FLA_Obj TW, FLA_Obj B, FLA_Obj X );
FLA_Error FLA_CAQR_UT_inc_solve_check( dim_t p, FLA_Obj A, FLA_Obj ATW, FLA_Obj R, FLA_Obj RTW, FLA_Obj B, FLA_Obj X );

FLA_Error FLA_UDdate_UT_check( FLA_Obj R, FLA_Obj C, FLA_Obj D, FLA_Obj T );
FLA_Error FLA_UDdate_UT_update_rhs_check( FLA_Obj T, FLA_Obj bR, FLA_Obj C, FLA_Obj bC, FLA_Obj D, FLA_Obj bD );
FLA_Error FLA_UDdate_UT_solve_check( FLA_Obj R, FLA_Obj bR, FLA_Obj x );

FLA_Error FLA_UDdate_UT_inc_check( FLA_Obj R, FLA_Obj C, FLA_Obj D, FLA_Obj T, FLA_Obj W );
FLA_Error FLA_UDdate_UT_inc_update_rhs_check( FLA_Obj T, FLA_Obj bR, FLA_Obj C, FLA_Obj bC, FLA_Obj D, FLA_Obj bD );
FLA_Error FLA_UDdate_UT_inc_solve_check( FLA_Obj R, FLA_Obj bR, FLA_Obj x );

FLA_Error FLA_CAQR_UT_inc_check( dim_t p, FLA_Obj A, FLA_Obj ATW, FLA_Obj R, FLA_Obj RTW );

FLA_Error FLA_Apply_QUD_UT_inc_check( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj T, FLA_Obj W, FLA_Obj R, FLA_Obj U, FLA_Obj C, FLA_Obj V, FLA_Obj D );

FLA_Error FLA_Apply_H2_UT_check( FLA_Side side, FLA_Obj tau, FLA_Obj u2, FLA_Obj a1t, FLA_Obj A2 );
FLA_Error FLA_Apply_HUD_UT_check( FLA_Side side, FLA_Obj tau, FLA_Obj w12t, FLA_Obj u2, FLA_Obj v2, FLA_Obj r12t, FLA_Obj C2, FLA_Obj D2 );
FLA_Error FLA_Accum_T_UT_check( FLA_Direct direct, FLA_Store storev, FLA_Obj A, FLA_Obj tau, FLA_Obj T );

// *_check prototypes for the Tevd/Hevd/Bsvd/Svd families, including the
// *_compute_scaling helpers (each of which takes a trailing sigma operand).
// FLA_Bsvd_ext_check takes the same arguments as FLA_Bsvd_check plus a
// FLA_Bool flag (apply_Uh2C) and an extra operand C; FLA_Svd_ext_check adds a
// FLA_Trans selector after each of jobu and jobv relative to FLA_Svd_check.
FLA_Error FLA_Tevd_compute_scaling_check( FLA_Obj d, FLA_Obj e, FLA_Obj sigma );
FLA_Error FLA_Hevd_compute_scaling_check( FLA_Uplo uplo, FLA_Obj A, FLA_Obj sigma );
FLA_Error FLA_Hevd_check( FLA_Evd_type jobz, FLA_Uplo uplo, FLA_Obj A, FLA_Obj l );
FLA_Error FLA_Hevdd_check( FLA_Evd_type jobz, FLA_Uplo uplo, FLA_Obj A, FLA_Obj l );
FLA_Error FLA_Hevdr_check( FLA_Evd_type jobz, FLA_Uplo uplo, FLA_Obj A, FLA_Obj l, FLA_Obj Z );

FLA_Error FLA_Bsvd_check( FLA_Uplo uplo, FLA_Obj d, FLA_Obj e,
                          FLA_Obj G, FLA_Obj H,
                          FLA_Svd_type jobu, FLA_Obj U,
                          FLA_Svd_type jobv, FLA_Obj V );
FLA_Error FLA_Bsvd_ext_check( FLA_Uplo uplo, FLA_Obj d, FLA_Obj e,
                              FLA_Obj G, FLA_Obj H,
                              FLA_Svd_type jobu, FLA_Obj U,
                              FLA_Svd_type jobv, FLA_Obj V,
                              FLA_Bool apply_Uh2C, FLA_Obj C );
FLA_Error FLA_Bsvd_compute_scaling_check( FLA_Obj d, FLA_Obj e, FLA_Obj sigma );
FLA_Error FLA_Svd_compute_scaling_check( FLA_Obj A, FLA_Obj sigma );
FLA_Error FLA_Svd_check( FLA_Svd_type jobu, FLA_Svd_type jobv, FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V );
FLA_Error FLA_Svd_ext_check( FLA_Svd_type jobu, FLA_Trans transu, FLA_Svd_type jobv, FLA_Trans transv,
                             FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V );
FLA_Error FLA_Svdd_check( FLA_Svd_type jobz, FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V );

// *_internal_check prototypes. Unlike the plain *_check routines, each of
// these takes a trailing pointer to the operation's control structure
// (fla_*_t* cntl) in addition to the enum selectors and FLA_Obj operands.
// NOTE(review): presumably called by the matching *_internal routine before
// dispatching on cntl -- confirm.
FLA_Error FLA_Chol_internal_check( FLA_Uplo uplo, FLA_Obj A, fla_chol_t* cntl );
FLA_Error FLA_LU_nopiv_internal_check( FLA_Obj A, fla_lu_t* cntl );
FLA_Error FLA_Trinv_internal_check( FLA_Uplo uplo, FLA_Diag diag, FLA_Obj A, fla_trinv_t* cntl );
FLA_Error FLA_Ttmm_internal_check( FLA_Uplo uplo, FLA_Obj A, fla_ttmm_t* cntl );
FLA_Error FLA_SPDinv_internal_check( FLA_Uplo uplo, FLA_Obj A, fla_spdinv_t* cntl );
FLA_Error FLA_Sylv_internal_check( FLA_Trans transa, FLA_Trans transb, FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Lyap_internal_check( FLA_Trans trans, FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj scale, fla_lyap_t* cntl );
FLA_Error FLA_QR_UT_internal_check( FLA_Obj A, FLA_Obj T, fla_qrut_t* cntl );
FLA_Error FLA_QR_UT_copy_internal_check( FLA_Obj A, FLA_Obj T, FLA_Obj U, fla_qrut_t* cntl );
FLA_Error FLA_QR2_UT_internal_check( FLA_Obj B, FLA_Obj D, FLA_Obj T, fla_qr2ut_t* cntl );
FLA_Error FLA_CAQR2_UT_internal_check( FLA_Obj B, FLA_Obj D, FLA_Obj T, fla_caqr2ut_t* cntl );
FLA_Error FLA_LQ_UT_internal_check( FLA_Obj A, FLA_Obj T, fla_lqut_t* cntl );
FLA_Error FLA_Hess_UT_internal_check( FLA_Obj A, FLA_Obj T, fla_hessut_t* cntl );
FLA_Error FLA_Tridiag_UT_internal_check( FLA_Uplo uplo, FLA_Obj A, FLA_Obj T, fla_tridiagut_t* cntl );
FLA_Error FLA_Bidiag_UT_internal_check( FLA_Obj A, FLA_Obj TU, FLA_Obj TV, fla_bidiagut_t* cntl );

FLA_Error FLA_UDdate_UT_internal_check( FLA_Obj R, FLA_Obj C, FLA_Obj D, FLA_Obj T, fla_uddateut_t* cntl );

FLA_Error FLA_Apply_Q_UT_internal_check( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q2_UT_internal_check( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C, FLA_Obj E, fla_apq2ut_t* cntl );
FLA_Error FLA_Apply_CAQ2_UT_internal_check( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C, FLA_Obj E, fla_apcaq2ut_t* cntl );
FLA_Error FLA_Apply_QUD_UT_internal_check( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj T, FLA_Obj W, FLA_Obj R, FLA_Obj U, FLA_Obj C, FLA_Obj V, FLA_Obj D, fla_apqudut_t* cntl );

FLA_Error FLA_Apply_Q_UT_inc_internal_check( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj A, FLA_Obj TW, FLA_Obj W1, FLA_Obj B, fla_apqutinc_t* cntl );
FLA_Error FLA_Apply_CAQ_UT_inc_internal_check( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj R, FLA_Obj TW, FLA_Obj W, FLA_Obj B, fla_apcaqutinc_t* cntl );
FLA_Error FLA_Apply_QUD_UT_inc_internal_check( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj T, FLA_Obj W, FLA_Obj R, FLA_Obj U, FLA_Obj C, FLA_Obj V, FLA_Obj D, fla_apqudutinc_t* cntl );

FLA_Error FLA_Eig_gest_internal_check( FLA_Inv inv, FLA_Uplo uplo, FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );
// end FLA_lapack_prototypes.h

  // Include prototypes for FLAME implementations of BLAS and LAPACK operations.
// begin FLA_blas_var_prototypes.h


// Level-1 BLAS
// begin FLA_Axpy.h


// begin FLA_Axpy_vars.h


// skipped #include "FLAME.h" 

// FLA_Axpy: four _blk_var<N> variants plus the FLA_Axpy_internal entry point.
// All five share the signature ( alpha, A, B, fla_axpy_t* cntl ).
// NOTE(review): FLA_Axpy_internal presumably selects one of the variants
// based on cntl -- confirm against the implementation.
FLA_Error FLA_Axpy_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpy_t* cntl );
FLA_Error FLA_Axpy_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpy_t* cntl );
FLA_Error FLA_Axpy_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpy_t* cntl );
FLA_Error FLA_Axpy_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpy_t* cntl );

// end FLA_Axpy_vars.h

FLA_Error FLA_Axpy_internal( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpy_t* cntl );

// end FLA_Axpy.h
// begin FLA_Axpyt.h


// begin FLA_Axpyt_n.h


// skipped #include "FLAME.h" 

// FLA_Axpyt: for each of the four name suffixes (_n, _t, _c, _h) there are
// four _blk_var<N> variants and one per-suffix front end, all with the
// signature ( alpha, A, B, fla_axpyt_t* cntl ). FLA_Axpyt_internal is the
// only routine that takes the FLA_Trans selector explicitly.
// NOTE(review): n/t/c/h presumably correspond to the FLA_Trans values
// (no-transpose, transpose, conjugate, conjugate-transpose) -- confirm.
FLA_Error FLA_Axpyt_n_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl );
FLA_Error FLA_Axpyt_n_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl );
FLA_Error FLA_Axpyt_n_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl );
FLA_Error FLA_Axpyt_n_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl );

// end FLA_Axpyt_n.h
// begin FLA_Axpyt_t.h


// skipped #include "FLAME.h" 

FLA_Error FLA_Axpyt_t_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl );
FLA_Error FLA_Axpyt_t_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl );
FLA_Error FLA_Axpyt_t_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl );
FLA_Error FLA_Axpyt_t_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl );

// end FLA_Axpyt_t.h
// begin FLA_Axpyt_c.h


// skipped #include "FLAME.h" 

FLA_Error FLA_Axpyt_c_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl );
FLA_Error FLA_Axpyt_c_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl );
FLA_Error FLA_Axpyt_c_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl );
FLA_Error FLA_Axpyt_c_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl );

// end FLA_Axpyt_c.h
// begin FLA_Axpyt_h.h


// skipped #include "FLAME.h" 

FLA_Error FLA_Axpyt_h_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl );
FLA_Error FLA_Axpyt_h_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl );
FLA_Error FLA_Axpyt_h_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl );
FLA_Error FLA_Axpyt_h_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl );

// end FLA_Axpyt_h.h

// Entry point taking the explicit FLA_Trans selector, followed by the four
// per-suffix front ends.
FLA_Error FLA_Axpyt_internal( FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl );
FLA_Error FLA_Axpyt_n( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl );
FLA_Error FLA_Axpyt_t( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl );
FLA_Error FLA_Axpyt_c( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl );
FLA_Error FLA_Axpyt_h( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl );

// end FLA_Axpyt.h
// begin FLA_Copy.h


// begin FLA_Copy_vars.h


// skipped #include "FLAME.h" 

// FLA_Copy: four _blk_var<N> variants plus the FLA_Copy_internal entry point,
// all with the signature ( A, B, fla_copy_t* cntl ).
// NOTE(review): the copy direction (A -> B or B -> A) is not visible from the
// prototypes -- confirm against the implementation.
FLA_Error FLA_Copy_blk_var1( FLA_Obj A, FLA_Obj B, fla_copy_t* cntl );
FLA_Error FLA_Copy_blk_var2( FLA_Obj A, FLA_Obj B, fla_copy_t* cntl );
FLA_Error FLA_Copy_blk_var3( FLA_Obj A, FLA_Obj B, fla_copy_t* cntl );
FLA_Error FLA_Copy_blk_var4( FLA_Obj A, FLA_Obj B, fla_copy_t* cntl );

// end FLA_Copy_vars.h

FLA_Error FLA_Copy_internal( FLA_Obj A, FLA_Obj B, fla_copy_t* cntl );

// end FLA_Copy.h
// begin FLA_Copyt.h


// begin FLA_Copyt_n.h


// skipped #include "FLAME.h" 

// FLA_Copyt: for each of the four name suffixes (_n, _t, _c, _h) there are
// four _blk_var<N> variants and one per-suffix front end, all with the
// signature ( A, B, fla_copyt_t* cntl ). FLA_Copyt_internal is the only
// routine that takes the FLA_Trans selector explicitly.
// NOTE(review): n/t/c/h presumably correspond to the FLA_Trans values
// -- confirm.
FLA_Error FLA_Copyt_n_blk_var1( FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl );
FLA_Error FLA_Copyt_n_blk_var2( FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl );
FLA_Error FLA_Copyt_n_blk_var3( FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl );
FLA_Error FLA_Copyt_n_blk_var4( FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl );

// end FLA_Copyt_n.h
// begin FLA_Copyt_t.h


// skipped #include "FLAME.h" 

FLA_Error FLA_Copyt_t_blk_var1( FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl );
FLA_Error FLA_Copyt_t_blk_var2( FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl );
FLA_Error FLA_Copyt_t_blk_var3( FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl );
FLA_Error FLA_Copyt_t_blk_var4( FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl );

// end FLA_Copyt_t.h
// begin FLA_Copyt_c.h


// skipped #include "FLAME.h" 

FLA_Error FLA_Copyt_c_blk_var1( FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl );
FLA_Error FLA_Copyt_c_blk_var2( FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl );
FLA_Error FLA_Copyt_c_blk_var3( FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl );
FLA_Error FLA_Copyt_c_blk_var4( FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl );

// end FLA_Copyt_c.h
// begin FLA_Copyt_h.h


// skipped #include "FLAME.h" 

FLA_Error FLA_Copyt_h_blk_var1( FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl );
FLA_Error FLA_Copyt_h_blk_var2( FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl );
FLA_Error FLA_Copyt_h_blk_var3( FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl );
FLA_Error FLA_Copyt_h_blk_var4( FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl );

// end FLA_Copyt_h.h

// Entry point taking the explicit FLA_Trans selector, followed by the four
// per-suffix front ends.
FLA_Error FLA_Copyt_internal( FLA_Trans trans, FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl );
FLA_Error FLA_Copyt_n( FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl );
FLA_Error FLA_Copyt_t( FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl );
FLA_Error FLA_Copyt_c( FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl );
FLA_Error FLA_Copyt_h( FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl );

// end FLA_Copyt.h
// begin FLA_Copyr.h


// begin FLA_Copyr_l.h


// skipped #include "FLAME.h" 

// FLA_Copyr: four _blk_var<N> variants for each of the _l and _u suffixes,
// all with the signature ( A, B, fla_copyr_t* cntl ).
// NOTE(review): l/u presumably correspond to the FLA_Uplo values (lower /
// upper) -- confirm.
FLA_Error FLA_Copyr_l_blk_var1( FLA_Obj A, FLA_Obj B, fla_copyr_t* cntl );
FLA_Error FLA_Copyr_l_blk_var2( FLA_Obj A, FLA_Obj B, fla_copyr_t* cntl );
FLA_Error FLA_Copyr_l_blk_var3( FLA_Obj A, FLA_Obj B, fla_copyr_t* cntl );
FLA_Error FLA_Copyr_l_blk_var4( FLA_Obj A, FLA_Obj B, fla_copyr_t* cntl );

// end FLA_Copyr_l.h
// begin FLA_Copyr_u.h


// skipped #include "FLAME.h" 

FLA_Error FLA_Copyr_u_blk_var1( FLA_Obj A, FLA_Obj B, fla_copyr_t* cntl );
FLA_Error FLA_Copyr_u_blk_var2( FLA_Obj A, FLA_Obj B, fla_copyr_t* cntl );
FLA_Error FLA_Copyr_u_blk_var3( FLA_Obj A, FLA_Obj B, fla_copyr_t* cntl );
FLA_Error FLA_Copyr_u_blk_var4( FLA_Obj A, FLA_Obj B, fla_copyr_t* cntl );

// end FLA_Copyr_u.h

// FLASH_Copyr takes the FLA_Uplo selector but no control structure, unlike
// the FLA_Copyr_* routines declared around it.
FLA_Error FLASH_Copyr( FLA_Uplo uplo, FLA_Obj A, FLA_Obj B );

// Entry point taking the explicit FLA_Uplo selector, followed by the two
// per-suffix front ends.
FLA_Error FLA_Copyr_internal( FLA_Uplo uplo, FLA_Obj A, FLA_Obj B, fla_copyr_t* cntl );
FLA_Error FLA_Copyr_l( FLA_Obj A, FLA_Obj B, fla_copyr_t* cntl );
FLA_Error FLA_Copyr_u( FLA_Obj A, FLA_Obj B, fla_copyr_t* cntl );

// end FLA_Copyr.h
// begin FLA_Scal.h


// begin FLA_Scal_vars.h


// skipped #include "FLAME.h" 

// FLA_Scal: four _blk_var<N> variants plus the FLA_Scal_internal entry point,
// all with the signature ( alpha, A, fla_scal_t* cntl ).
FLA_Error FLA_Scal_blk_var1( FLA_Obj alpha, FLA_Obj A, fla_scal_t* cntl );
FLA_Error FLA_Scal_blk_var2( FLA_Obj alpha, FLA_Obj A, fla_scal_t* cntl );
FLA_Error FLA_Scal_blk_var3( FLA_Obj alpha, FLA_Obj A, fla_scal_t* cntl );
FLA_Error FLA_Scal_blk_var4( FLA_Obj alpha, FLA_Obj A, fla_scal_t* cntl );

// end FLA_Scal_vars.h

FLA_Error FLA_Scal_internal( FLA_Obj alpha, FLA_Obj A, fla_scal_t* cntl );

// end FLA_Scal.h
// begin FLA_Scalr.h


// begin FLA_Scalr_l.h


// skipped #include "FLAME.h" 

// FLA_Scalr: four _blk_var<N> variants for each of the _l and _u suffixes,
// all with the signature ( alpha, A, fla_scalr_t* cntl ).
// NOTE(review): l/u presumably correspond to the FLA_Uplo values (lower /
// upper) -- confirm.
FLA_Error FLA_Scalr_l_blk_var1( FLA_Obj alpha, FLA_Obj A, fla_scalr_t* cntl );
FLA_Error FLA_Scalr_l_blk_var2( FLA_Obj alpha, FLA_Obj A, fla_scalr_t* cntl );
FLA_Error FLA_Scalr_l_blk_var3( FLA_Obj alpha, FLA_Obj A, fla_scalr_t* cntl );
FLA_Error FLA_Scalr_l_blk_var4( FLA_Obj alpha, FLA_Obj A, fla_scalr_t* cntl );

// end FLA_Scalr_l.h
// begin FLA_Scalr_u.h


// skipped #include "FLAME.h" 

FLA_Error FLA_Scalr_u_blk_var1( FLA_Obj alpha, FLA_Obj A, fla_scalr_t* cntl );
FLA_Error FLA_Scalr_u_blk_var2( FLA_Obj alpha, FLA_Obj A, fla_scalr_t* cntl );
FLA_Error FLA_Scalr_u_blk_var3( FLA_Obj alpha, FLA_Obj A, fla_scalr_t* cntl );
FLA_Error FLA_Scalr_u_blk_var4( FLA_Obj alpha, FLA_Obj A, fla_scalr_t* cntl );

// end FLA_Scalr_u.h

// Entry point taking the explicit FLA_Uplo selector, followed by the two
// per-suffix front ends.
FLA_Error FLA_Scalr_internal( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, fla_scalr_t* cntl );

FLA_Error FLA_Scalr_l( FLA_Obj alpha, FLA_Obj A, fla_scalr_t* cntl );
FLA_Error FLA_Scalr_u( FLA_Obj alpha, FLA_Obj A, fla_scalr_t* cntl );

// end FLA_Scalr.h

// Level-2 BLAS
// begin FLA_Gemv.h


// begin FLA_Gemv_h.h


// skipped #include "FLAME.h" 

// Gemv "h" case, "blk" variants. Only variants 1, 2, 5 and 6 are declared
// (there is no var3 or var4). Each takes the objects alpha, A, x, beta and y
// plus a fla_gemv_t control pointer, and returns an FLA_Error.
// NOTE(review): "h" presumably selects a value of the transa parameter of
// FLA_Gemv_internal (conjugate transpose?) -- confirm.
FLA_Error FLA_Gemv_h_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t* cntl );
FLA_Error FLA_Gemv_h_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t* cntl );
FLA_Error FLA_Gemv_h_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t* cntl );
FLA_Error FLA_Gemv_h_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t* cntl );

// end FLA_Gemv_h.h
// begin FLA_Gemv_n.h


// skipped #include "FLAME.h" 

// Gemv "n" case, "blk" variants. Only variants 1, 2, 5 and 6 are declared
// (there is no var3 or var4). Each takes the objects alpha, A, x, beta and y
// plus a fla_gemv_t control pointer, and returns an FLA_Error.
// NOTE(review): "n" presumably selects a value of the transa parameter of
// FLA_Gemv_internal (no transpose?) -- confirm.
FLA_Error FLA_Gemv_n_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t* cntl );
FLA_Error FLA_Gemv_n_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t* cntl );
FLA_Error FLA_Gemv_n_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t* cntl );
FLA_Error FLA_Gemv_n_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t* cntl );

// end FLA_Gemv_n.h
// begin FLA_Gemv_t.h


// skipped #include "FLAME.h" 

// Gemv "t" case, "blk" variants. Only variants 1, 2, 5 and 6 are declared
// (there is no var3 or var4). Each takes the objects alpha, A, x, beta and y
// plus a fla_gemv_t control pointer, and returns an FLA_Error.
// NOTE(review): "t" presumably selects a value of the transa parameter of
// FLA_Gemv_internal (transpose?) -- confirm.
FLA_Error FLA_Gemv_t_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t* cntl );
FLA_Error FLA_Gemv_t_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t* cntl );
FLA_Error FLA_Gemv_t_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t* cntl );
FLA_Error FLA_Gemv_t_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t* cntl );

// end FLA_Gemv_t.h

// Control-driven Gemv entry point: takes transa explicitly in addition to
// alpha, A, x, beta, y and the fla_gemv_t control pointer. Returns an
// FLA_Error.
FLA_Error FLA_Gemv_internal( FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t* cntl );

// Per-case Gemv forms without a transa parameter.
// NOTE(review): _h/_n/_t presumably correspond to individual FLA_Trans
// values -- confirm against the implementation.
FLA_Error FLA_Gemv_h( FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t* cntl );
FLA_Error FLA_Gemv_n( FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t* cntl );
FLA_Error FLA_Gemv_t( FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t* cntl );

// end FLA_Gemv.h
// begin FLA_Trsv.h


// begin FLA_Trsv_lc.h


// skipped #include "FLAME.h" 

// Trsv "lc" case, "blk" variants 1-2: each takes diagA, the objects A and x,
// and a fla_trsv_t control pointer, and returns an FLA_Error.
// NOTE(review): "lc" presumably encodes the (uplo, transa) pair of
// FLA_Trsv_internal -- confirm.
FLA_Error FLA_Trsv_lc_blk_var1( FLA_Diag diagA, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl );
FLA_Error FLA_Trsv_lc_blk_var2( FLA_Diag diagA, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl );

// end FLA_Trsv_lc.h
// begin FLA_Trsv_ln.h


// skipped #include "FLAME.h" 

// Trsv "ln" case, "blk" variants 1-2: each takes diagA, the objects A and x,
// and a fla_trsv_t control pointer, and returns an FLA_Error.
// NOTE(review): "ln" presumably encodes the (uplo, transa) pair of
// FLA_Trsv_internal -- confirm.
FLA_Error FLA_Trsv_ln_blk_var1( FLA_Diag diagA, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl );
FLA_Error FLA_Trsv_ln_blk_var2( FLA_Diag diagA, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl );

// end FLA_Trsv_ln.h
// begin FLA_Trsv_lt.h


// skipped #include "FLAME.h" 

// Trsv "lt" case, "blk" variants 1-2: each takes diagA, the objects A and x,
// and a fla_trsv_t control pointer, and returns an FLA_Error.
// NOTE(review): "lt" presumably encodes the (uplo, transa) pair of
// FLA_Trsv_internal -- confirm.
FLA_Error FLA_Trsv_lt_blk_var1( FLA_Diag diagA, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl );
FLA_Error FLA_Trsv_lt_blk_var2( FLA_Diag diagA, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl );

// end FLA_Trsv_lt.h
// begin FLA_Trsv_uc.h


// skipped #include "FLAME.h" 

// Trsv "uc" case, "blk" variants 1-2: each takes diagA, the objects A and x,
// and a fla_trsv_t control pointer, and returns an FLA_Error.
// NOTE(review): "uc" presumably encodes the (uplo, transa) pair of
// FLA_Trsv_internal -- confirm.
FLA_Error FLA_Trsv_uc_blk_var1( FLA_Diag diagA, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl );
FLA_Error FLA_Trsv_uc_blk_var2( FLA_Diag diagA, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl );

// end FLA_Trsv_uc.h
// begin FLA_Trsv_un.h


// skipped #include "FLAME.h" 

// Trsv "un" case, "blk" variants 1-2: each takes diagA, the objects A and x,
// and a fla_trsv_t control pointer, and returns an FLA_Error.
// NOTE(review): "un" presumably encodes the (uplo, transa) pair of
// FLA_Trsv_internal -- confirm.
FLA_Error FLA_Trsv_un_blk_var1( FLA_Diag diagA, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl );
FLA_Error FLA_Trsv_un_blk_var2( FLA_Diag diagA, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl );

// end FLA_Trsv_un.h
// begin FLA_Trsv_ut.h


// skipped #include "FLAME.h" 

// Trsv "ut" case, "blk" variants 1-2: each takes diagA, the objects A and x,
// and a fla_trsv_t control pointer, and returns an FLA_Error.
// NOTE(review): "ut" presumably encodes the (uplo, transa) pair of
// FLA_Trsv_internal -- confirm.
FLA_Error FLA_Trsv_ut_blk_var1( FLA_Diag diagA, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl );
FLA_Error FLA_Trsv_ut_blk_var2( FLA_Diag diagA, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl );

// end FLA_Trsv_ut.h

// Control-driven Trsv entry point: takes uplo, transa and diag explicitly in
// addition to A, x and the fla_trsv_t control pointer. Returns an FLA_Error.
FLA_Error FLA_Trsv_internal( FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl );

// Per-case Trsv forms that keep diag but drop uplo and transa. The FLA_Diag
// parameter is named diag here and diagA in the FLA_Trsv_*_blk_var*
// prototypes; the type is the same in both.
// NOTE(review): the two-letter suffix presumably encodes (uplo, transa) --
// confirm against the implementation.
FLA_Error FLA_Trsv_lc( FLA_Diag diag, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl );
FLA_Error FLA_Trsv_ln( FLA_Diag diag, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl );
FLA_Error FLA_Trsv_lt( FLA_Diag diag, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl );
FLA_Error FLA_Trsv_uc( FLA_Diag diag, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl );
FLA_Error FLA_Trsv_un( FLA_Diag diag, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl );
FLA_Error FLA_Trsv_ut( FLA_Diag diag, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl );

// end FLA_Trsv.h

// Level-3 BLAS
// begin FLA_Gemm.h


// begin FLA_Gemm_cc.h


// skipped #include "FLAME.h" 

// Gemm "cc" case, "blk" variants 1-6: each takes the objects alpha, A, B,
// beta and C plus a fla_gemm_t control pointer, and returns an FLA_Error.
// NOTE(review): "cc" presumably encodes the (transa, transb) pair of
// FLA_Gemm_internal and "blk" presumably means blocked -- confirm.
FLA_Error FLA_Gemm_cc_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_cc_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_cc_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_cc_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_cc_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_cc_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );

// Gemm "cc" case, "unb" variants 1-6: same five objects, no control pointer.
// NOTE(review): "unb" presumably means unblocked -- confirm.
FLA_Error FLA_Gemm_cc_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_cc_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_cc_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_cc_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_cc_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_cc_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Gemm_cc.h
// begin FLA_Gemm_ch.h


// skipped #include "FLAME.h" 

// Gemm "ch" case, "blk" variants 1-6: each takes the objects alpha, A, B,
// beta and C plus a fla_gemm_t control pointer, and returns an FLA_Error.
// NOTE(review): "ch" presumably encodes the (transa, transb) pair of
// FLA_Gemm_internal and "blk" presumably means blocked -- confirm.
FLA_Error FLA_Gemm_ch_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_ch_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_ch_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_ch_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_ch_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_ch_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );

// Gemm "ch" case, "unb" variants 1-6: same five objects, no control pointer.
// NOTE(review): "unb" presumably means unblocked -- confirm.
FLA_Error FLA_Gemm_ch_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_ch_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_ch_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_ch_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_ch_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_ch_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Gemm_ch.h
// begin FLA_Gemm_cn.h


// skipped #include "FLAME.h" 

// Gemm "cn" case, "blk" variants 1-6: each takes the objects alpha, A, B,
// beta and C plus a fla_gemm_t control pointer, and returns an FLA_Error.
// NOTE(review): "cn" presumably encodes the (transa, transb) pair of
// FLA_Gemm_internal and "blk" presumably means blocked -- confirm.
FLA_Error FLA_Gemm_cn_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_cn_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_cn_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_cn_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_cn_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_cn_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );

// Gemm "cn" case, "unb" variants 1-6: same five objects, no control pointer.
// NOTE(review): "unb" presumably means unblocked -- confirm.
FLA_Error FLA_Gemm_cn_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_cn_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_cn_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_cn_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_cn_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_cn_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Gemm_cn.h
// begin FLA_Gemm_ct.h


// skipped #include "FLAME.h" 

// Gemm "ct" case, "blk" variants 1-6: each takes the objects alpha, A, B,
// beta and C plus a fla_gemm_t control pointer, and returns an FLA_Error.
// NOTE(review): "ct" presumably encodes the (transa, transb) pair of
// FLA_Gemm_internal and "blk" presumably means blocked -- confirm.
FLA_Error FLA_Gemm_ct_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_ct_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_ct_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_ct_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_ct_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_ct_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );

// Gemm "ct" case, "unb" variants 1-6: same five objects, no control pointer.
// NOTE(review): "unb" presumably means unblocked -- confirm.
FLA_Error FLA_Gemm_ct_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_ct_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_ct_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_ct_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_ct_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_ct_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Gemm_ct.h
// begin FLA_Gemm_hc.h


// skipped #include "FLAME.h" 

// Gemm "hc" case, "blk" variants 1-6: each takes the objects alpha, A, B,
// beta and C plus a fla_gemm_t control pointer, and returns an FLA_Error.
// NOTE(review): "hc" presumably encodes the (transa, transb) pair of
// FLA_Gemm_internal and "blk" presumably means blocked -- confirm.
FLA_Error FLA_Gemm_hc_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_hc_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_hc_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_hc_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_hc_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_hc_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );

// Gemm "hc" case, "unb" variants 1-6: same five objects, no control pointer.
// NOTE(review): "unb" presumably means unblocked -- confirm.
FLA_Error FLA_Gemm_hc_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_hc_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_hc_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_hc_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_hc_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_hc_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Gemm_hc.h
// begin FLA_Gemm_hh.h


// skipped #include "FLAME.h" 

// Gemm "hh" case, "blk" variants 1-6: each takes the objects alpha, A, B,
// beta and C plus a fla_gemm_t control pointer, and returns an FLA_Error.
// NOTE(review): "hh" presumably encodes the (transa, transb) pair of
// FLA_Gemm_internal and "blk" presumably means blocked -- confirm.
FLA_Error FLA_Gemm_hh_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_hh_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_hh_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_hh_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_hh_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_hh_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );

// Gemm "hh" case, "unb" variants 1-6: same five objects, no control pointer.
// NOTE(review): "unb" presumably means unblocked -- confirm.
FLA_Error FLA_Gemm_hh_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_hh_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_hh_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_hh_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_hh_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_hh_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Gemm_hh.h
// begin FLA_Gemm_hn.h


// skipped #include "FLAME.h" 

// Gemm "hn" case, "blk" variants 1-6: each takes the objects alpha, A, B,
// beta and C plus a fla_gemm_t control pointer, and returns an FLA_Error.
// NOTE(review): "hn" presumably encodes the (transa, transb) pair of
// FLA_Gemm_internal and "blk" presumably means blocked -- confirm.
FLA_Error FLA_Gemm_hn_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_hn_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_hn_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_hn_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_hn_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_hn_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );

// Gemm "hn" case, "unb" variants 1-6: same five objects, no control pointer.
// NOTE(review): "unb" presumably means unblocked -- confirm.
FLA_Error FLA_Gemm_hn_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_hn_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_hn_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_hn_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_hn_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_hn_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Gemm_hn.h
// begin FLA_Gemm_ht.h


// skipped #include "FLAME.h" 

// Gemm "ht" case, "blk" variants 1-6: each takes the objects alpha, A, B,
// beta and C plus a fla_gemm_t control pointer, and returns an FLA_Error.
// NOTE(review): "ht" presumably encodes the (transa, transb) pair of
// FLA_Gemm_internal and "blk" presumably means blocked -- confirm.
FLA_Error FLA_Gemm_ht_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_ht_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_ht_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_ht_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_ht_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_ht_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );

// Gemm "ht" case, "unb" variants 1-6: same five objects, no control pointer.
// NOTE(review): "unb" presumably means unblocked -- confirm.
FLA_Error FLA_Gemm_ht_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_ht_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_ht_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_ht_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_ht_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_ht_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Gemm_ht.h
// begin FLA_Gemm_nc.h


// skipped #include "FLAME.h" 

// Gemm "nc" case, "blk" variants 1-6: each takes the objects alpha, A, B,
// beta and C plus a fla_gemm_t control pointer, and returns an FLA_Error.
// NOTE(review): "nc" presumably encodes the (transa, transb) pair of
// FLA_Gemm_internal and "blk" presumably means blocked -- confirm.
FLA_Error FLA_Gemm_nc_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_nc_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_nc_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_nc_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_nc_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_nc_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );

// Gemm "nc" case, "unb" variants 1-6: same five objects, no control pointer.
// NOTE(review): "unb" presumably means unblocked -- confirm.
FLA_Error FLA_Gemm_nc_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_nc_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_nc_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_nc_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_nc_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_nc_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Gemm_nc.h
// begin FLA_Gemm_nh.h


// skipped #include "FLAME.h" 

// Gemm "nh" case, "blk" variants 1-6: each takes the objects alpha, A, B,
// beta and C plus a fla_gemm_t control pointer, and returns an FLA_Error.
// NOTE(review): "nh" presumably encodes the (transa, transb) pair of
// FLA_Gemm_internal and "blk" presumably means blocked -- confirm.
FLA_Error FLA_Gemm_nh_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_nh_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_nh_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_nh_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_nh_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_nh_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );

// Gemm "nh" case, "unb" variants 1-6: same five objects, no control pointer.
// NOTE(review): "unb" presumably means unblocked -- confirm.
FLA_Error FLA_Gemm_nh_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_nh_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_nh_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_nh_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_nh_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_nh_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Gemm_nh.h
// begin FLA_Gemm_nn.h


// skipped #include "FLAME.h" 

// Gemm "nn" case, "blk" variants 1-6: each takes the objects alpha, A, B,
// beta and C plus a fla_gemm_t control pointer, and returns an FLA_Error.
// NOTE(review): "nn" presumably encodes the (transa, transb) pair of
// FLA_Gemm_internal and "blk" presumably means blocked -- confirm.
FLA_Error FLA_Gemm_nn_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_nn_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_nn_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_nn_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_nn_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_nn_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );

// Gemm "nn" case, "unb" variants 1-6: same five objects, no control pointer.
// NOTE(review): "unb" presumably means unblocked -- confirm.
FLA_Error FLA_Gemm_nn_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_nn_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_nn_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_nn_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_nn_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_nn_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Gemm_nn.h
// begin FLA_Gemm_nt.h


// skipped #include "FLAME.h" 

// Gemm "nt" case, "blk" variants 1-6: each takes the objects alpha, A, B,
// beta and C plus a fla_gemm_t control pointer, and returns an FLA_Error.
// NOTE(review): "nt" presumably encodes the (transa, transb) pair of
// FLA_Gemm_internal and "blk" presumably means blocked -- confirm.
FLA_Error FLA_Gemm_nt_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_nt_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_nt_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_nt_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_nt_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_nt_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );

// Gemm "nt" case, "unb" variants 1-6: same five objects, no control pointer.
// NOTE(review): "unb" presumably means unblocked -- confirm.
FLA_Error FLA_Gemm_nt_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_nt_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_nt_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_nt_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_nt_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_nt_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Gemm_nt.h
// begin FLA_Gemm_tc.h


// skipped #include "FLAME.h" 

// Gemm "tc" case, "blk" variants 1-6: each takes the objects alpha, A, B,
// beta and C plus a fla_gemm_t control pointer, and returns an FLA_Error.
// NOTE(review): "tc" presumably encodes the (transa, transb) pair of
// FLA_Gemm_internal and "blk" presumably means blocked -- confirm.
FLA_Error FLA_Gemm_tc_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_tc_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_tc_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_tc_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_tc_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_tc_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );

// Gemm "tc" case, "unb" variants 1-6: same five objects, no control pointer.
// NOTE(review): "unb" presumably means unblocked -- confirm.
FLA_Error FLA_Gemm_tc_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_tc_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_tc_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_tc_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_tc_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_tc_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Gemm_tc.h
// begin FLA_Gemm_th.h


// skipped #include "FLAME.h" 

// Gemm "th" case, "blk" variants 1-6: each takes the objects alpha, A, B,
// beta and C plus a fla_gemm_t control pointer, and returns an FLA_Error.
// NOTE(review): "th" presumably encodes the (transa, transb) pair of
// FLA_Gemm_internal and "blk" presumably means blocked -- confirm.
FLA_Error FLA_Gemm_th_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_th_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_th_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_th_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_th_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_th_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );

// Gemm "th" case, "unb" variants 1-6: same five objects, no control pointer.
// NOTE(review): "unb" presumably means unblocked -- confirm.
FLA_Error FLA_Gemm_th_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_th_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_th_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_th_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_th_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_th_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Gemm_th.h
// begin FLA_Gemm_tn.h


// skipped #include "FLAME.h" 

// Gemm "tn" case, "blk" variants 1-6: each takes the objects alpha, A, B,
// beta and C plus a fla_gemm_t control pointer, and returns an FLA_Error.
// NOTE(review): "tn" presumably encodes the (transa, transb) pair of
// FLA_Gemm_internal and "blk" presumably means blocked -- confirm.
FLA_Error FLA_Gemm_tn_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_tn_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_tn_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_tn_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_tn_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_tn_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );

// Gemm "tn" case, "unb" variants 1-6: same five objects, no control pointer.
// NOTE(review): "unb" presumably means unblocked -- confirm.
FLA_Error FLA_Gemm_tn_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_tn_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_tn_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_tn_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_tn_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_tn_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Gemm_tn.h
// begin FLA_Gemm_tt.h


// skipped #include "FLAME.h" 

// Gemm "tt" case, "blk" variants 1-6: each takes the objects alpha, A, B,
// beta and C plus a fla_gemm_t control pointer, and returns an FLA_Error.
// NOTE(review): "tt" presumably encodes the (transa, transb) pair of
// FLA_Gemm_internal and "blk" presumably means blocked -- confirm.
FLA_Error FLA_Gemm_tt_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_tt_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_tt_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_tt_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_tt_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_tt_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );

// Gemm "tt" case, "unb" variants 1-6: same five objects, no control pointer.
// NOTE(review): "unb" presumably means unblocked -- confirm.
FLA_Error FLA_Gemm_tt_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_tt_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_tt_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_tt_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_tt_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Gemm_tt_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Gemm_tt.h

// Control-driven Gemm entry point: takes transa and transb explicitly in
// addition to alpha, A, B, beta, C and the fla_gemm_t control pointer.
// Returns an FLA_Error.
FLA_Error FLA_Gemm_internal( FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );

// Per-case Gemm forms without transa/transb parameters: one declaration for
// every two-letter combination of c, h, n and t (16 in total).
// NOTE(review): the first letter presumably corresponds to transa and the
// second to transb -- confirm against the implementation.
FLA_Error FLA_Gemm_cc( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_ch( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_cn( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_ct( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_hc( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_hh( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_hn( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_ht( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_nc( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_nh( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_nn( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_nt( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_tc( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_th( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_tn( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );
FLA_Error FLA_Gemm_tt( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl );

// end FLA_Gemm.h
// begin FLA_Hemm.h


// begin FLA_Hemm_ll.h


// skipped #include "FLAME.h" 

// Hemm "ll" case, "blk" variants 1-10: each takes the objects alpha, A, B,
// beta and C plus a fla_hemm_t control pointer, and returns an FLA_Error.
// NOTE(review): "ll" presumably encodes side = left, uplo = lower, and "blk"
// presumably means blocked -- confirm against the libflame documentation.
FLA_Error FLA_Hemm_ll_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_ll_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_ll_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_ll_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_ll_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_ll_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_ll_blk_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_ll_blk_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_ll_blk_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_ll_blk_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );

// Hemm "ll" case, "unb" variants 1-10: same five objects, no control pointer.
// NOTE(review): "unb" presumably means unblocked -- confirm.
FLA_Error FLA_Hemm_ll_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_ll_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_ll_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_ll_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_ll_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_ll_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_ll_unb_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_ll_unb_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_ll_unb_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_ll_unb_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Hemm_ll.h
// begin FLA_Hemm_lu.h


// skipped #include "FLAME.h" 

// Hemm "lu" case, "blk" variants 1-10: each takes the objects alpha, A, B,
// beta and C plus a fla_hemm_t control pointer, and returns an FLA_Error.
// NOTE(review): "lu" presumably encodes side = left, uplo = upper, and "blk"
// presumably means blocked -- confirm against the libflame documentation.
FLA_Error FLA_Hemm_lu_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_lu_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_lu_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_lu_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_lu_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_lu_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_lu_blk_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_lu_blk_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_lu_blk_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_lu_blk_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );

// Hemm "lu" case, "unb" variants 1-10: same five objects, no control pointer.
// NOTE(review): "unb" presumably means unblocked -- confirm.
FLA_Error FLA_Hemm_lu_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_lu_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_lu_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_lu_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_lu_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_lu_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_lu_unb_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_lu_unb_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_lu_unb_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_lu_unb_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Hemm_lu.h
// begin FLA_Hemm_rl.h


// skipped #include "FLAME.h" 

// FLA_Hemm_rl_blk_var1..var10: ten entry points sharing one signature --
// alpha, A, B, beta, C as FLA_Obj plus a fla_hemm_t* control argument.
// NOTE(review): "rl"/"blk" presumably mean right side, lower triangle, blocked
// algorithm -- inferred from the names only; confirm against the definitions.
FLA_Error FLA_Hemm_rl_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_rl_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_rl_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_rl_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_rl_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_rl_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_rl_blk_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_rl_blk_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_rl_blk_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_rl_blk_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );

// FLA_Hemm_rl_unb_var1..var10: same five FLA_Obj operands as the _blk_ forms
// (alpha, A, B, beta, C) but with no fla_hemm_t* control argument.
// NOTE(review): "unb" presumably denotes the unblocked variants -- confirm.
FLA_Error FLA_Hemm_rl_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_rl_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_rl_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_rl_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_rl_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_rl_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_rl_unb_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_rl_unb_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_rl_unb_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_rl_unb_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Hemm_rl.h
// begin FLA_Hemm_ru.h


// skipped #include "FLAME.h" 

// FLA_Hemm_ru_blk_var1..var10: ten entry points sharing one signature --
// alpha, A, B, beta, C as FLA_Obj plus a fla_hemm_t* control argument.
// NOTE(review): "ru"/"blk" presumably mean right side, upper triangle, blocked
// algorithm -- inferred from the names only; confirm against the definitions.
FLA_Error FLA_Hemm_ru_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_ru_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_ru_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_ru_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_ru_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_ru_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_ru_blk_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_ru_blk_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_ru_blk_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_ru_blk_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );

// FLA_Hemm_ru_unb_var1..var10: same five FLA_Obj operands as the _blk_ forms
// (alpha, A, B, beta, C) but with no fla_hemm_t* control argument.
// NOTE(review): "unb" presumably denotes the unblocked variants -- confirm.
FLA_Error FLA_Hemm_ru_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_ru_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_ru_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_ru_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_ru_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_ru_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_ru_unb_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_ru_unb_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_ru_unb_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Hemm_ru_unb_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Hemm_ru.h

// FLA_Hemm_internal: takes explicit FLA_Side and FLA_Uplo selectors in addition
// to the five FLA_Obj operands and the fla_hemm_t* control argument.
// NOTE(review): presumably dispatches on side/uplo to the FLA_Hemm_{ll,lu,rl,ru}
// entry points declared below -- confirm in the definition.
FLA_Error FLA_Hemm_internal( FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );

// Four entry points distinguished only by a two-letter suffix (ll, lu, rl, ru);
// each takes alpha, A, B, beta, C as FLA_Obj plus a fla_hemm_t* control argument.
// NOTE(review): presumably each selects among the matching _blk_/_unb_ variants
// based on cntl -- confirm in the definitions.
FLA_Error FLA_Hemm_ll( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_lu( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_rl( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );
FLA_Error FLA_Hemm_ru( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl );

// end FLA_Hemm.h
// begin FLA_Herk.h


// begin FLA_Herk_lh.h


// skipped #include "FLAME.h" 

// FLA_Herk_lh_blk_var1..var6: six entry points sharing one signature --
// alpha, A, beta, C as FLA_Obj plus a fla_herk_t* control argument.
// NOTE(review): "lh"/"blk" presumably mean lower triangle, conjugate-transposed
// A, blocked algorithm -- inferred from the names only; confirm.
FLA_Error FLA_Herk_lh_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );
FLA_Error FLA_Herk_lh_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );
FLA_Error FLA_Herk_lh_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );
FLA_Error FLA_Herk_lh_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );
FLA_Error FLA_Herk_lh_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );
FLA_Error FLA_Herk_lh_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );

// FLA_Herk_lh_unb_var1..var6: same four FLA_Obj operands as the _blk_ forms
// (alpha, A, beta, C) but with no fla_herk_t* control argument.
// NOTE(review): "unb" presumably denotes the unblocked variants -- confirm.
FLA_Error FLA_Herk_lh_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Herk_lh_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Herk_lh_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Herk_lh_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Herk_lh_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Herk_lh_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );

// end FLA_Herk_lh.h
// begin FLA_Herk_ln.h


// skipped #include "FLAME.h" 

// FLA_Herk_ln_blk_var1..var6: six entry points sharing one signature --
// alpha, A, beta, C as FLA_Obj plus a fla_herk_t* control argument.
// NOTE(review): "ln"/"blk" presumably mean lower triangle, non-transposed A,
// blocked algorithm -- inferred from the names only; confirm.
FLA_Error FLA_Herk_ln_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );
FLA_Error FLA_Herk_ln_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );
FLA_Error FLA_Herk_ln_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );
FLA_Error FLA_Herk_ln_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );
FLA_Error FLA_Herk_ln_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );
FLA_Error FLA_Herk_ln_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );

// FLA_Herk_ln_unb_var1..var6: same four FLA_Obj operands as the _blk_ forms
// (alpha, A, beta, C) but with no fla_herk_t* control argument.
// NOTE(review): "unb" presumably denotes the unblocked variants -- confirm.
FLA_Error FLA_Herk_ln_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Herk_ln_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Herk_ln_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Herk_ln_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Herk_ln_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Herk_ln_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );

// end FLA_Herk_ln.h
// begin FLA_Herk_uh.h


// skipped #include "FLAME.h" 

// FLA_Herk_uh_blk_var1..var6: six entry points sharing one signature --
// alpha, A, beta, C as FLA_Obj plus a fla_herk_t* control argument.
// NOTE(review): "uh"/"blk" presumably mean upper triangle, conjugate-transposed
// A, blocked algorithm -- inferred from the names only; confirm.
FLA_Error FLA_Herk_uh_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );
FLA_Error FLA_Herk_uh_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );
FLA_Error FLA_Herk_uh_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );
FLA_Error FLA_Herk_uh_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );
FLA_Error FLA_Herk_uh_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );
FLA_Error FLA_Herk_uh_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );

// FLA_Herk_uh_unb_var1..var6: same four FLA_Obj operands as the _blk_ forms
// (alpha, A, beta, C) but with no fla_herk_t* control argument.
// NOTE(review): "unb" presumably denotes the unblocked variants -- confirm.
FLA_Error FLA_Herk_uh_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Herk_uh_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Herk_uh_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Herk_uh_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Herk_uh_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Herk_uh_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );

// end FLA_Herk_uh.h
// begin FLA_Herk_un.h


// skipped #include "FLAME.h" 

// FLA_Herk_un_blk_var1..var6: six entry points sharing one signature --
// alpha, A, beta, C as FLA_Obj plus a fla_herk_t* control argument.
// NOTE(review): "un"/"blk" presumably mean upper triangle, non-transposed A,
// blocked algorithm -- inferred from the names only; confirm.
FLA_Error FLA_Herk_un_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );
FLA_Error FLA_Herk_un_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );
FLA_Error FLA_Herk_un_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );
FLA_Error FLA_Herk_un_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );
FLA_Error FLA_Herk_un_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );
FLA_Error FLA_Herk_un_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );

// FLA_Herk_un_unb_var1..var6: same four FLA_Obj operands as the _blk_ forms
// (alpha, A, beta, C) but with no fla_herk_t* control argument.
// NOTE(review): "unb" presumably denotes the unblocked variants -- confirm.
FLA_Error FLA_Herk_un_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Herk_un_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Herk_un_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Herk_un_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Herk_un_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Herk_un_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );

// end FLA_Herk_un.h

// FLA_Herk_internal: takes explicit FLA_Uplo and FLA_Trans selectors in addition
// to the four FLA_Obj operands and the fla_herk_t* control argument.
// NOTE(review): presumably dispatches on uplo/trans to the
// FLA_Herk_{lh,ln,uh,un} entry points declared below -- confirm.
FLA_Error FLA_Herk_internal( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );

// Four entry points distinguished only by a two-letter suffix (lh, ln, uh, un);
// each takes alpha, A, beta, C as FLA_Obj plus a fla_herk_t* control argument.
// NOTE(review): presumably each selects among the matching _blk_/_unb_ variants
// based on cntl -- confirm in the definitions.
FLA_Error FLA_Herk_lh( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );
FLA_Error FLA_Herk_ln( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );
FLA_Error FLA_Herk_uh( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );
FLA_Error FLA_Herk_un( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl );

// end FLA_Herk.h
// begin FLA_Her2k.h


// begin FLA_Her2k_lh.h


// skipped #include "FLAME.h" 

// FLA_Her2k_lh_blk_var1..var10: ten entry points sharing one signature --
// alpha, A, B, beta, C as FLA_Obj plus a fla_her2k_t* control argument.
// NOTE(review): "lh"/"blk" presumably mean lower triangle, conjugate-transposed
// operands, blocked algorithm -- inferred from the names only; confirm.
FLA_Error FLA_Her2k_lh_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_lh_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_lh_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_lh_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_lh_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_lh_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_lh_blk_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_lh_blk_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_lh_blk_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_lh_blk_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );

// FLA_Her2k_lh_unb_var1..var10: same five FLA_Obj operands as the _blk_ forms
// (alpha, A, B, beta, C) but with no fla_her2k_t* control argument.
// NOTE(review): "unb" presumably denotes the unblocked variants -- confirm.
FLA_Error FLA_Her2k_lh_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_lh_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_lh_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_lh_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_lh_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_lh_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_lh_unb_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_lh_unb_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_lh_unb_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_lh_unb_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Her2k_lh.h
// begin FLA_Her2k_ln.h


// skipped #include "FLAME.h" 

// FLA_Her2k_ln_blk_var1..var10: ten entry points sharing one signature --
// alpha, A, B, beta, C as FLA_Obj plus a fla_her2k_t* control argument.
// NOTE(review): "ln"/"blk" presumably mean lower triangle, non-transposed
// operands, blocked algorithm -- inferred from the names only; confirm.
FLA_Error FLA_Her2k_ln_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_ln_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_ln_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_ln_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_ln_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_ln_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_ln_blk_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_ln_blk_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_ln_blk_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_ln_blk_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );

// FLA_Her2k_ln_unb_var1..var10: same five FLA_Obj operands as the _blk_ forms
// (alpha, A, B, beta, C) but with no fla_her2k_t* control argument.
// NOTE(review): "unb" presumably denotes the unblocked variants -- confirm.
FLA_Error FLA_Her2k_ln_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_ln_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_ln_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_ln_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_ln_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_ln_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_ln_unb_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_ln_unb_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_ln_unb_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_ln_unb_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Her2k_ln.h
// begin FLA_Her2k_uh.h


// skipped #include "FLAME.h" 

// FLA_Her2k_uh_blk_var1..var10: ten entry points sharing one signature --
// alpha, A, B, beta, C as FLA_Obj plus a fla_her2k_t* control argument.
// NOTE(review): "uh"/"blk" presumably mean upper triangle, conjugate-transposed
// operands, blocked algorithm -- inferred from the names only; confirm.
FLA_Error FLA_Her2k_uh_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_uh_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_uh_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_uh_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_uh_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_uh_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_uh_blk_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_uh_blk_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_uh_blk_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_uh_blk_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );

// FLA_Her2k_uh_unb_var1..var10: same five FLA_Obj operands as the _blk_ forms
// (alpha, A, B, beta, C) but with no fla_her2k_t* control argument.
// NOTE(review): "unb" presumably denotes the unblocked variants -- confirm.
FLA_Error FLA_Her2k_uh_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_uh_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_uh_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_uh_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_uh_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_uh_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_uh_unb_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_uh_unb_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_uh_unb_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_uh_unb_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Her2k_uh.h
// begin FLA_Her2k_un.h


// skipped #include "FLAME.h" 

// FLA_Her2k_un_blk_var1..var10: ten entry points sharing one signature --
// alpha, A, B, beta, C as FLA_Obj plus a fla_her2k_t* control argument.
// NOTE(review): "un"/"blk" presumably mean upper triangle, non-transposed
// operands, blocked algorithm -- inferred from the names only; confirm.
FLA_Error FLA_Her2k_un_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_un_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_un_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_un_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_un_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_un_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_un_blk_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_un_blk_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_un_blk_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_un_blk_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );

// FLA_Her2k_un_unb_var1..var10: same five FLA_Obj operands as the _blk_ forms
// (alpha, A, B, beta, C) but with no fla_her2k_t* control argument.
// NOTE(review): "unb" presumably denotes the unblocked variants -- confirm.
FLA_Error FLA_Her2k_un_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_un_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_un_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_un_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_un_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_un_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_un_unb_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_un_unb_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_un_unb_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Her2k_un_unb_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Her2k_un.h

// FLA_Her2k_internal: takes explicit FLA_Uplo and FLA_Trans selectors in
// addition to the five FLA_Obj operands and the fla_her2k_t* control argument.
// NOTE(review): presumably dispatches on uplo/trans to the
// FLA_Her2k_{lh,ln,uh,un} entry points declared below -- confirm.
FLA_Error FLA_Her2k_internal( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );

// Four entry points distinguished only by a two-letter suffix (lh, ln, uh, un);
// each takes alpha, A, B, beta, C as FLA_Obj plus a fla_her2k_t* control argument.
// NOTE(review): presumably each selects among the matching _blk_/_unb_ variants
// based on cntl -- confirm in the definitions.
FLA_Error FLA_Her2k_lh( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_ln( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_uh( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );
FLA_Error FLA_Her2k_un( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl );

// end FLA_Her2k.h
// begin FLA_Symm.h


// begin FLA_Symm_ll.h


// skipped #include "FLAME.h" 

// FLA_Symm_ll_blk_var1..var10: ten entry points sharing one signature --
// alpha, A, B, beta, C as FLA_Obj plus a fla_symm_t* control argument.
// NOTE(review): "ll"/"blk" presumably mean left side, lower triangle, blocked
// algorithm -- inferred from the names only; confirm against the definitions.
FLA_Error FLA_Symm_ll_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_ll_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_ll_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_ll_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_ll_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_ll_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_ll_blk_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_ll_blk_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_ll_blk_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_ll_blk_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );

// FLA_Symm_ll_unb_var1..var10: same five FLA_Obj operands as the _blk_ forms
// (alpha, A, B, beta, C) but with no fla_symm_t* control argument.
// NOTE(review): "unb" presumably denotes the unblocked variants -- confirm.
FLA_Error FLA_Symm_ll_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_ll_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_ll_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_ll_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_ll_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_ll_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_ll_unb_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_ll_unb_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_ll_unb_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_ll_unb_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Symm_ll.h
// begin FLA_Symm_lu.h


// skipped #include "FLAME.h" 

// FLA_Symm_lu_blk_var1..var10: ten entry points sharing one signature --
// alpha, A, B, beta, C as FLA_Obj plus a fla_symm_t* control argument.
// NOTE(review): "lu"/"blk" presumably mean left side, upper triangle, blocked
// algorithm -- inferred from the names only; confirm against the definitions.
FLA_Error FLA_Symm_lu_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_lu_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_lu_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_lu_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_lu_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_lu_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_lu_blk_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_lu_blk_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_lu_blk_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_lu_blk_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );

// FLA_Symm_lu_unb_var1..var10: same five FLA_Obj operands as the _blk_ forms
// (alpha, A, B, beta, C) but with no fla_symm_t* control argument.
// NOTE(review): "unb" presumably denotes the unblocked variants -- confirm.
FLA_Error FLA_Symm_lu_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_lu_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_lu_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_lu_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_lu_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_lu_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_lu_unb_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_lu_unb_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_lu_unb_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_lu_unb_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Symm_lu.h
// begin FLA_Symm_rl.h


// skipped #include "FLAME.h" 

// FLA_Symm_rl_blk_var1..var10: ten entry points sharing one signature --
// alpha, A, B, beta, C as FLA_Obj plus a fla_symm_t* control argument.
// NOTE(review): "rl"/"blk" presumably mean right side, lower triangle, blocked
// algorithm -- inferred from the names only; confirm against the definitions.
FLA_Error FLA_Symm_rl_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_rl_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_rl_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_rl_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_rl_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_rl_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_rl_blk_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_rl_blk_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_rl_blk_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_rl_blk_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );

// FLA_Symm_rl_unb_var1..var10: same five FLA_Obj operands as the _blk_ forms
// (alpha, A, B, beta, C) but with no fla_symm_t* control argument.
// NOTE(review): "unb" presumably denotes the unblocked variants -- confirm.
FLA_Error FLA_Symm_rl_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_rl_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_rl_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_rl_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_rl_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_rl_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_rl_unb_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_rl_unb_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_rl_unb_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_rl_unb_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Symm_rl.h
// begin FLA_Symm_ru.h


// skipped #include "FLAME.h" 

// FLA_Symm_ru_blk_var1..var10: ten entry points sharing one signature --
// alpha, A, B, beta, C as FLA_Obj plus a fla_symm_t* control argument.
// NOTE(review): "ru"/"blk" presumably mean right side, upper triangle, blocked
// algorithm -- inferred from the names only; confirm against the definitions.
FLA_Error FLA_Symm_ru_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_ru_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_ru_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_ru_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_ru_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_ru_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_ru_blk_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_ru_blk_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_ru_blk_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_ru_blk_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );

// FLA_Symm_ru_unb_var1..var10: same five FLA_Obj operands as the _blk_ forms
// (alpha, A, B, beta, C) but with no fla_symm_t* control argument.
// NOTE(review): "unb" presumably denotes the unblocked variants -- confirm.
FLA_Error FLA_Symm_ru_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_ru_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_ru_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_ru_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_ru_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_ru_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_ru_unb_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_ru_unb_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_ru_unb_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Symm_ru_unb_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Symm_ru.h

// FLA_Symm_internal: takes explicit FLA_Side and FLA_Uplo selectors in addition
// to the five FLA_Obj operands and the fla_symm_t* control argument.
// NOTE(review): presumably dispatches on side/uplo to the FLA_Symm_{ll,lu,rl,ru}
// entry points declared below -- confirm in the definition.
FLA_Error FLA_Symm_internal( FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );

// Four entry points distinguished only by a two-letter suffix (ll, lu, rl, ru);
// each takes alpha, A, B, beta, C as FLA_Obj plus a fla_symm_t* control argument.
// NOTE(review): presumably each selects among the matching _blk_/_unb_ variants
// based on cntl -- confirm in the definitions.
FLA_Error FLA_Symm_ll( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_lu( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_rl( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );
FLA_Error FLA_Symm_ru( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl );

// end FLA_Symm.h
// begin FLA_Syrk.h


// begin FLA_Syrk_ln.h


// skipped #include "FLAME.h" 

// FLA_Syrk_ln_blk_var1..var6: six entry points sharing one signature --
// alpha, A, beta, C as FLA_Obj plus a fla_syrk_t* control argument.
// NOTE(review): "ln"/"blk" presumably mean lower triangle, non-transposed A,
// blocked algorithm -- inferred from the names only; confirm.
FLA_Error FLA_Syrk_ln_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );
FLA_Error FLA_Syrk_ln_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );
FLA_Error FLA_Syrk_ln_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );
FLA_Error FLA_Syrk_ln_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );
FLA_Error FLA_Syrk_ln_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );
FLA_Error FLA_Syrk_ln_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );

// FLA_Syrk_ln_unb_var1..var6: same four FLA_Obj operands as the _blk_ forms
// (alpha, A, beta, C) but with no fla_syrk_t* control argument.
// NOTE(review): "unb" presumably denotes the unblocked variants -- confirm.
FLA_Error FLA_Syrk_ln_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syrk_ln_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syrk_ln_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syrk_ln_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syrk_ln_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syrk_ln_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );

// end FLA_Syrk_ln.h
// begin FLA_Syrk_lt.h


// skipped #include "FLAME.h" 

// FLA_Syrk_lt_blk_var1..var6: six entry points sharing one signature --
// alpha, A, beta, C as FLA_Obj plus a fla_syrk_t* control argument.
// NOTE(review): "lt"/"blk" presumably mean lower triangle, transposed A,
// blocked algorithm -- inferred from the names only; confirm.
FLA_Error FLA_Syrk_lt_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );
FLA_Error FLA_Syrk_lt_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );
FLA_Error FLA_Syrk_lt_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );
FLA_Error FLA_Syrk_lt_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );
FLA_Error FLA_Syrk_lt_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );
FLA_Error FLA_Syrk_lt_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );

// FLA_Syrk_lt_unb_var1..var6: same four FLA_Obj operands as the _blk_ forms
// (alpha, A, beta, C) but with no fla_syrk_t* control argument.
// NOTE(review): "unb" presumably denotes the unblocked variants -- confirm.
FLA_Error FLA_Syrk_lt_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syrk_lt_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syrk_lt_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syrk_lt_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syrk_lt_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syrk_lt_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );

// end FLA_Syrk_lt.h
// begin FLA_Syrk_un.h


// skipped #include "FLAME.h" 

// FLA_Syrk_un variant prototypes. All return FLA_Error and take the operands
// (alpha, A, beta, C) as FLA_Obj values; the six `blk` variants also take a
// fla_syrk_t* control argument, which the six `unb` variants do not.
// NOTE(review): `blk`/`unb` presumably mean blocked/unblocked, and the `un`
// suffix presumably selects the uplo/trans case -- confirm against the sources.
FLA_Error FLA_Syrk_un_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );
FLA_Error FLA_Syrk_un_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );
FLA_Error FLA_Syrk_un_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );
FLA_Error FLA_Syrk_un_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );
FLA_Error FLA_Syrk_un_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );
FLA_Error FLA_Syrk_un_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );

FLA_Error FLA_Syrk_un_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syrk_un_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syrk_un_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syrk_un_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syrk_un_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syrk_un_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );

// end FLA_Syrk_un.h
// begin FLA_Syrk_ut.h


// skipped #include "FLAME.h" 

// FLA_Syrk_ut variant prototypes. All return FLA_Error and take the operands
// (alpha, A, beta, C) as FLA_Obj values; the six `blk` variants also take a
// fla_syrk_t* control argument, which the six `unb` variants do not.
// NOTE(review): `blk`/`unb` presumably mean blocked/unblocked, and the `ut`
// suffix presumably selects the uplo/trans case -- confirm against the sources.
FLA_Error FLA_Syrk_ut_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );
FLA_Error FLA_Syrk_ut_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );
FLA_Error FLA_Syrk_ut_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );
FLA_Error FLA_Syrk_ut_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );
FLA_Error FLA_Syrk_ut_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );
FLA_Error FLA_Syrk_ut_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );

FLA_Error FLA_Syrk_ut_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syrk_ut_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syrk_ut_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syrk_ut_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syrk_ut_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syrk_ut_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );

// end FLA_Syrk_ut.h

// Internal Syrk entry point. Unlike the suffixed FLA_Syrk_* prototypes, the
// uplo and trans cases are passed as explicit FLA_Uplo/FLA_Trans arguments,
// followed by the operands (alpha, A, beta, C) and a fla_syrk_t* control
// argument. Returns FLA_Error.
// NOTE(review): presumably dispatches to FLA_Syrk_ln/lt/un/ut based on
// uplo/trans -- confirm against the implementation.
FLA_Error FLA_Syrk_internal( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );

// Per-case Syrk entry points, one per two-letter suffix (ln, lt, un, ut).
// Each takes the operands (alpha, A, beta, C) plus a fla_syrk_t* control
// argument and returns FLA_Error.
// NOTE(review): the suffix presumably encodes uplo (l/u) and trans (n/t),
// and the control argument presumably selects a blk/unb variant -- confirm.
FLA_Error FLA_Syrk_ln( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );
FLA_Error FLA_Syrk_lt( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );
FLA_Error FLA_Syrk_un( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );
FLA_Error FLA_Syrk_ut( FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl );

// end FLA_Syrk.h
// begin FLA_Syr2k.h


// begin FLA_Syr2k_ln.h


// skipped #include "FLAME.h" 

// FLA_Syr2k_ln variant prototypes. All return FLA_Error and take the operands
// (alpha, A, B, beta, C) as FLA_Obj values; the ten `blk` variants also take a
// fla_syr2k_t* control argument, which the ten `unb` variants do not.
// NOTE(review): `blk`/`unb` presumably mean blocked/unblocked, and the `ln`
// suffix presumably selects the uplo/trans case -- confirm against the sources.
FLA_Error FLA_Syr2k_ln_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_ln_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_ln_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_ln_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_ln_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_ln_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_ln_blk_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_ln_blk_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_ln_blk_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_ln_blk_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );

FLA_Error FLA_Syr2k_ln_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_ln_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_ln_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_ln_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_ln_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_ln_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_ln_unb_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_ln_unb_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_ln_unb_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_ln_unb_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Syr2k_ln.h
// begin FLA_Syr2k_lt.h


// skipped #include "FLAME.h" 

// FLA_Syr2k_lt variant prototypes. All return FLA_Error and take the operands
// (alpha, A, B, beta, C) as FLA_Obj values; the ten `blk` variants also take a
// fla_syr2k_t* control argument, which the ten `unb` variants do not.
// NOTE(review): `blk`/`unb` presumably mean blocked/unblocked, and the `lt`
// suffix presumably selects the uplo/trans case -- confirm against the sources.
FLA_Error FLA_Syr2k_lt_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_lt_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_lt_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_lt_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_lt_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_lt_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_lt_blk_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_lt_blk_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_lt_blk_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_lt_blk_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );

FLA_Error FLA_Syr2k_lt_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_lt_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_lt_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_lt_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_lt_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_lt_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_lt_unb_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_lt_unb_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_lt_unb_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_lt_unb_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Syr2k_lt.h
// begin FLA_Syr2k_un.h


// skipped #include "FLAME.h" 

// FLA_Syr2k_un variant prototypes. All return FLA_Error and take the operands
// (alpha, A, B, beta, C) as FLA_Obj values; the ten `blk` variants also take a
// fla_syr2k_t* control argument, which the ten `unb` variants do not.
// NOTE(review): `blk`/`unb` presumably mean blocked/unblocked, and the `un`
// suffix presumably selects the uplo/trans case -- confirm against the sources.
FLA_Error FLA_Syr2k_un_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_un_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_un_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_un_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_un_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_un_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_un_blk_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_un_blk_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_un_blk_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_un_blk_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );

FLA_Error FLA_Syr2k_un_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_un_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_un_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_un_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_un_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_un_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_un_unb_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_un_unb_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_un_unb_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_un_unb_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Syr2k_un.h
// begin FLA_Syr2k_ut.h


// skipped #include "FLAME.h" 

// FLA_Syr2k_ut variant prototypes. All return FLA_Error and take the operands
// (alpha, A, B, beta, C) as FLA_Obj values; the ten `blk` variants also take a
// fla_syr2k_t* control argument, which the ten `unb` variants do not.
// NOTE(review): `blk`/`unb` presumably mean blocked/unblocked, and the `ut`
// suffix presumably selects the uplo/trans case -- confirm against the sources.
FLA_Error FLA_Syr2k_ut_blk_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_ut_blk_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_ut_blk_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_ut_blk_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_ut_blk_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_ut_blk_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_ut_blk_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_ut_blk_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_ut_blk_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_ut_blk_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );

FLA_Error FLA_Syr2k_ut_unb_var1( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_ut_unb_var2( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_ut_unb_var3( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_ut_unb_var4( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_ut_unb_var5( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_ut_unb_var6( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_ut_unb_var7( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_ut_unb_var8( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_ut_unb_var9( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLA_Syr2k_ut_unb_var10( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );

// end FLA_Syr2k_ut.h

// Internal Syr2k entry point. Unlike the suffixed FLA_Syr2k_* prototypes, the
// uplo and trans cases are passed as explicit FLA_Uplo/FLA_Trans arguments,
// followed by the operands (alpha, A, B, beta, C) and a fla_syr2k_t* control
// argument. Returns FLA_Error.
// NOTE(review): presumably dispatches to FLA_Syr2k_ln/lt/un/ut based on
// uplo/trans -- confirm against the implementation.
FLA_Error FLA_Syr2k_internal( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );

// Per-case Syr2k entry points, one per two-letter suffix (ln, lt, un, ut).
// Each takes the operands (alpha, A, B, beta, C) plus a fla_syr2k_t* control
// argument and returns FLA_Error.
// NOTE(review): the suffix presumably encodes uplo (l/u) and trans (n/t),
// and the control argument presumably selects a blk/unb variant -- confirm.
FLA_Error FLA_Syr2k_ln( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_lt( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_un( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );
FLA_Error FLA_Syr2k_ut( FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl );

// end FLA_Syr2k.h
// begin FLA_Trmm.h


// begin FLA_Trmm_llc.h


// skipped #include "FLAME.h" 

// FLA_Trmm_llc variant prototypes. All return FLA_Error and take
// (diagA, alpha, A, B); the four `blk` variants also take a fla_trmm_t*
// control argument, which the four `unb` variants do not.
// NOTE(review): `blk`/`unb` presumably mean blocked/unblocked, and `llc`
// presumably encodes the side/uplo/trans case -- confirm against the sources.
FLA_Error FLA_Trmm_llc_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_llc_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_llc_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_llc_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_llc_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_llc_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_llc_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_llc_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trmm_llc.h
// begin FLA_Trmm_llh.h


// skipped #include "FLAME.h" 

// FLA_Trmm_llh variant prototypes. All return FLA_Error and take
// (diagA, alpha, A, B); the four `blk` variants also take a fla_trmm_t*
// control argument, which the four `unb` variants do not.
// NOTE(review): `blk`/`unb` presumably mean blocked/unblocked, and `llh`
// presumably encodes the side/uplo/trans case -- confirm against the sources.
FLA_Error FLA_Trmm_llh_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_llh_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_llh_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_llh_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_llh_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_llh_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_llh_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_llh_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trmm_llh.h
// begin FLA_Trmm_lln.h


// skipped #include "FLAME.h" 

// FLA_Trmm_lln variant prototypes. All return FLA_Error and take
// (diagA, alpha, A, B); the four `blk` variants also take a fla_trmm_t*
// control argument, which the four `unb` variants do not.
// NOTE(review): `blk`/`unb` presumably mean blocked/unblocked, and `lln`
// presumably encodes the side/uplo/trans case -- confirm against the sources.
FLA_Error FLA_Trmm_lln_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_lln_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_lln_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_lln_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_lln_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_lln_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_lln_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_lln_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trmm_lln.h
// begin FLA_Trmm_llt.h


// skipped #include "FLAME.h" 

// FLA_Trmm_llt variant prototypes. All return FLA_Error and take
// (diagA, alpha, A, B); the four `blk` variants also take a fla_trmm_t*
// control argument, which the four `unb` variants do not.
// NOTE(review): `blk`/`unb` presumably mean blocked/unblocked, and `llt`
// presumably encodes the side/uplo/trans case -- confirm against the sources.
FLA_Error FLA_Trmm_llt_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_llt_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_llt_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_llt_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_llt_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_llt_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_llt_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_llt_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trmm_llt.h
// begin FLA_Trmm_luc.h


// skipped #include "FLAME.h" 

// FLA_Trmm_luc variant prototypes. All return FLA_Error and take
// (diagA, alpha, A, B); the four `blk` variants also take a fla_trmm_t*
// control argument, which the four `unb` variants do not.
// NOTE(review): `blk`/`unb` presumably mean blocked/unblocked, and `luc`
// presumably encodes the side/uplo/trans case -- confirm against the sources.
FLA_Error FLA_Trmm_luc_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_luc_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_luc_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_luc_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_luc_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_luc_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_luc_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_luc_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trmm_luc.h
// begin FLA_Trmm_luh.h


// skipped #include "FLAME.h" 

// FLA_Trmm_luh variant prototypes. All return FLA_Error and take
// (diagA, alpha, A, B); the four `blk` variants also take a fla_trmm_t*
// control argument, which the four `unb` variants do not.
// NOTE(review): `blk`/`unb` presumably mean blocked/unblocked, and `luh`
// presumably encodes the side/uplo/trans case -- confirm against the sources.
FLA_Error FLA_Trmm_luh_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_luh_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_luh_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_luh_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_luh_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_luh_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_luh_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_luh_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trmm_luh.h
// begin FLA_Trmm_lun.h


// skipped #include "FLAME.h" 

// FLA_Trmm_lun variant prototypes. All return FLA_Error and take
// (diagA, alpha, A, B); the four `blk` variants also take a fla_trmm_t*
// control argument, which the four `unb` variants do not.
// NOTE(review): `blk`/`unb` presumably mean blocked/unblocked, and `lun`
// presumably encodes the side/uplo/trans case -- confirm against the sources.
FLA_Error FLA_Trmm_lun_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_lun_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_lun_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_lun_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_lun_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_lun_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_lun_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_lun_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trmm_lun.h
// begin FLA_Trmm_lut.h


// skipped #include "FLAME.h" 

// FLA_Trmm_lut variant prototypes. All return FLA_Error and take
// (diagA, alpha, A, B); the four `blk` variants also take a fla_trmm_t*
// control argument, which the four `unb` variants do not.
// NOTE(review): `blk`/`unb` presumably mean blocked/unblocked, and `lut`
// presumably encodes the side/uplo/trans case -- confirm against the sources.
FLA_Error FLA_Trmm_lut_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_lut_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_lut_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_lut_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_lut_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_lut_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_lut_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_lut_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trmm_lut.h
// begin FLA_Trmm_rlc.h


// skipped #include "FLAME.h" 

// FLA_Trmm_rlc variant prototypes. All return FLA_Error and take
// (diagA, alpha, A, B); the four `blk` variants also take a fla_trmm_t*
// control argument, which the four `unb` variants do not.
// NOTE(review): `blk`/`unb` presumably mean blocked/unblocked, and `rlc`
// presumably encodes the side/uplo/trans case -- confirm against the sources.
FLA_Error FLA_Trmm_rlc_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_rlc_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_rlc_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_rlc_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_rlc_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_rlc_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_rlc_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_rlc_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trmm_rlc.h
// begin FLA_Trmm_rlh.h


// skipped #include "FLAME.h" 

// FLA_Trmm_rlh variant prototypes. All return FLA_Error and take
// (diagA, alpha, A, B); the four `blk` variants also take a fla_trmm_t*
// control argument, which the four `unb` variants do not.
// NOTE(review): `blk`/`unb` presumably mean blocked/unblocked, and `rlh`
// presumably encodes the side/uplo/trans case -- confirm against the sources.
FLA_Error FLA_Trmm_rlh_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_rlh_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_rlh_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_rlh_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_rlh_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_rlh_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_rlh_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_rlh_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trmm_rlh.h
// begin FLA_Trmm_rln.h


// skipped #include "FLAME.h" 

// FLA_Trmm_rln variant prototypes. All return FLA_Error and take
// (diagA, alpha, A, B); the four `blk` variants also take a fla_trmm_t*
// control argument, which the four `unb` variants do not.
// NOTE(review): `blk`/`unb` presumably mean blocked/unblocked, and `rln`
// presumably encodes the side/uplo/trans case -- confirm against the sources.
FLA_Error FLA_Trmm_rln_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_rln_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_rln_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_rln_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_rln_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_rln_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_rln_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_rln_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trmm_rln.h
// begin FLA_Trmm_rlt.h


// skipped #include "FLAME.h" 

// FLA_Trmm_rlt variant prototypes. All return FLA_Error and take
// (diagA, alpha, A, B); the four `blk` variants also take a fla_trmm_t*
// control argument, which the four `unb` variants do not.
// NOTE(review): `blk`/`unb` presumably mean blocked/unblocked, and `rlt`
// presumably encodes the side/uplo/trans case -- confirm against the sources.
FLA_Error FLA_Trmm_rlt_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_rlt_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_rlt_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_rlt_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_rlt_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_rlt_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_rlt_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_rlt_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trmm_rlt.h
// begin FLA_Trmm_ruc.h


// skipped #include "FLAME.h" 

// FLA_Trmm_ruc variant prototypes. All return FLA_Error and take
// (diagA, alpha, A, B); the four `blk` variants also take a fla_trmm_t*
// control argument, which the four `unb` variants do not.
// NOTE(review): `blk`/`unb` presumably mean blocked/unblocked, and `ruc`
// presumably encodes the side/uplo/trans case -- confirm against the sources.
FLA_Error FLA_Trmm_ruc_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_ruc_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_ruc_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_ruc_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_ruc_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_ruc_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_ruc_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_ruc_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trmm_ruc.h
// begin FLA_Trmm_ruh.h


// skipped #include "FLAME.h" 

// FLA_Trmm_ruh variant prototypes. All return FLA_Error and take
// (diagA, alpha, A, B); the four `blk` variants also take a fla_trmm_t*
// control argument, which the four `unb` variants do not.
// NOTE(review): `blk`/`unb` presumably mean blocked/unblocked, and `ruh`
// presumably encodes the side/uplo/trans case -- confirm against the sources.
FLA_Error FLA_Trmm_ruh_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_ruh_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_ruh_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_ruh_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_ruh_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_ruh_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_ruh_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_ruh_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trmm_ruh.h
// begin FLA_Trmm_run.h


// skipped #include "FLAME.h" 

// FLA_Trmm_run variant prototypes. All return FLA_Error and take
// (diagA, alpha, A, B); the four `blk` variants also take a fla_trmm_t*
// control argument, which the four `unb` variants do not.
// NOTE(review): `blk`/`unb` presumably mean blocked/unblocked, and `run`
// presumably encodes the side/uplo/trans case -- confirm against the sources.
FLA_Error FLA_Trmm_run_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_run_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_run_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_run_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_run_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_run_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_run_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_run_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trmm_run.h
// begin FLA_Trmm_rut.h


// skipped #include "FLAME.h" 

// FLA_Trmm_rut variant prototypes. All return FLA_Error and take
// (diagA, alpha, A, B); the four `blk` variants also take a fla_trmm_t*
// control argument, which the four `unb` variants do not.
// NOTE(review): `blk`/`unb` presumably mean blocked/unblocked, and `rut`
// presumably encodes the side/uplo/trans case -- confirm against the sources.
FLA_Error FLA_Trmm_rut_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_rut_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_rut_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_rut_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_rut_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_rut_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_rut_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trmm_rut_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trmm_rut.h

// Internal Trmm entry point. Unlike the suffixed FLA_Trmm_* prototypes, the
// side, uplo and transa cases are passed as explicit FLA_Side/FLA_Uplo/
// FLA_Trans arguments, followed by diag, the operands (alpha, A, B) and a
// fla_trmm_t* control argument. Returns FLA_Error.
// NOTE(review): presumably dispatches to the matching three-letter
// FLA_Trmm_* routine -- confirm against the implementation.
FLA_Error FLA_Trmm_internal( FLA_Side side, FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );

// Per-case Trmm entry points, one per three-letter suffix (sixteen in total).
// Each takes (diag, alpha, A, B) plus a fla_trmm_t* control argument and
// returns FLA_Error. The first parameter is spelled `diag` here and `diagA`
// in the *_blk_varN / *_unb_varN prototypes; both have type FLA_Diag.
// NOTE(review): the suffix presumably encodes side (l/r), uplo (l/u) and
// trans (c/h/n/t) in that order -- confirm against the implementation.
FLA_Error FLA_Trmm_llc( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_llh( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_lln( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_llt( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_luc( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_luh( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_lun( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_lut( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_rlc( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_rlh( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_rln( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_rlt( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_ruc( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_ruh( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_run( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );
FLA_Error FLA_Trmm_rut( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trmm_t* cntl );

// end FLA_Trmm.h
// begin FLA_Trsm.h


// begin FLA_Trsm_llc.h


// skipped #include "FLAME.h" 

// FLA_Trsm_llc variant prototypes. All return FLA_Error and take
// (diagA, alpha, A, B); the four `blk` variants also take a fla_trsm_t*
// control argument, which the four `unb` variants do not.
// NOTE(review): `blk`/`unb` presumably mean blocked/unblocked, and `llc`
// presumably encodes the side/uplo/trans case -- confirm against the sources.
FLA_Error FLA_Trsm_llc_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_llc_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_llc_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_llc_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_llc_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_llc_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_llc_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_llc_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trsm_llc.h
// begin FLA_Trsm_llh.h


// skipped #include "FLAME.h" 

// FLA_Trsm_llh variant prototypes. All return FLA_Error and take
// (diagA, alpha, A, B); the four `blk` variants also take a fla_trsm_t*
// control argument, which the four `unb` variants do not.
// NOTE(review): `blk`/`unb` presumably mean blocked/unblocked, and `llh`
// presumably encodes the side/uplo/trans case -- confirm against the sources.
FLA_Error FLA_Trsm_llh_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_llh_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_llh_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_llh_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_llh_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_llh_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_llh_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_llh_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trsm_llh.h
// begin FLA_Trsm_lln.h


// skipped #include "FLAME.h" 

// FLA_Trsm_lln variant prototypes. All return FLA_Error and take
// (diagA, alpha, A, B); the four `blk` variants also take a fla_trsm_t*
// control argument, which the four `unb` variants do not.
// NOTE(review): `blk`/`unb` presumably mean blocked/unblocked, and `lln`
// presumably encodes the side/uplo/trans case -- confirm against the sources.
FLA_Error FLA_Trsm_lln_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_lln_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_lln_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_lln_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_lln_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_lln_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_lln_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_lln_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trsm_lln.h
// begin FLA_Trsm_llt.h


// skipped #include "FLAME.h" 

// FLA_Trsm_llt variant prototypes. All return FLA_Error and take
// (diagA, alpha, A, B); the four `blk` variants also take a fla_trsm_t*
// control argument, which the four `unb` variants do not.
// NOTE(review): `blk`/`unb` presumably mean blocked/unblocked, and `llt`
// presumably encodes the side/uplo/trans case -- confirm against the sources.
FLA_Error FLA_Trsm_llt_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_llt_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_llt_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_llt_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_llt_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_llt_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_llt_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_llt_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trsm_llt.h
// begin FLA_Trsm_luc.h


// skipped #include "FLAME.h" 

// FLA_Trsm_luc variant prototypes. All return FLA_Error and take
// (diagA, alpha, A, B); the four `blk` variants also take a fla_trsm_t*
// control argument, which the four `unb` variants do not.
// NOTE(review): `blk`/`unb` presumably mean blocked/unblocked, and `luc`
// presumably encodes the side/uplo/trans case -- confirm against the sources.
FLA_Error FLA_Trsm_luc_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_luc_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_luc_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_luc_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_luc_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_luc_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_luc_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_luc_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trsm_luc.h
// begin FLA_Trsm_luh.h


// skipped #include "FLAME.h" 

// Algorithmic variants 1-4 of FLA_Trsm_luh. The blocked (blk) forms take an
// extra fla_trsm_t* control argument; the unblocked (unb) forms do not.
// All take (diagA, alpha, A, B) and return an FLA_Error status.
// NOTE(review): "luh" presumably means side=left, uplo=upper, trans=conjugate
// transpose — confirm.
FLA_Error FLA_Trsm_luh_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_luh_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_luh_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_luh_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_luh_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_luh_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_luh_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_luh_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trsm_luh.h
// begin FLA_Trsm_lun.h


// skipped #include "FLAME.h" 

// Algorithmic variants 1-4 of FLA_Trsm_lun. The blocked (blk) forms take an
// extra fla_trsm_t* control argument; the unblocked (unb) forms do not.
// All take (diagA, alpha, A, B) and return an FLA_Error status.
// NOTE(review): "lun" presumably means side=left, uplo=upper, trans=none — confirm.
FLA_Error FLA_Trsm_lun_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_lun_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_lun_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_lun_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_lun_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_lun_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_lun_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_lun_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trsm_lun.h
// begin FLA_Trsm_lut.h


// skipped #include "FLAME.h" 

// Algorithmic variants 1-4 of FLA_Trsm_lut. The blocked (blk) forms take an
// extra fla_trsm_t* control argument; the unblocked (unb) forms do not.
// All take (diagA, alpha, A, B) and return an FLA_Error status.
// NOTE(review): "lut" presumably means side=left, uplo=upper, trans=transpose — confirm.
FLA_Error FLA_Trsm_lut_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_lut_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_lut_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_lut_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_lut_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_lut_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_lut_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_lut_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trsm_lut.h
// begin FLA_Trsm_rlc.h


// skipped #include "FLAME.h" 

// Algorithmic variants 1-4 of FLA_Trsm_rlc. The blocked (blk) forms take an
// extra fla_trsm_t* control argument; the unblocked (unb) forms do not.
// All take (diagA, alpha, A, B) and return an FLA_Error status.
// NOTE(review): "rlc" presumably means side=right, uplo=lower, trans=conjugate
// (no transpose) — confirm.
FLA_Error FLA_Trsm_rlc_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_rlc_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_rlc_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_rlc_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_rlc_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_rlc_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_rlc_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_rlc_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trsm_rlc.h
// begin FLA_Trsm_rlh.h


// skipped #include "FLAME.h" 

// Algorithmic variants 1-4 of FLA_Trsm_rlh. The blocked (blk) forms take an
// extra fla_trsm_t* control argument; the unblocked (unb) forms do not.
// All take (diagA, alpha, A, B) and return an FLA_Error status.
// NOTE(review): "rlh" presumably means side=right, uplo=lower, trans=conjugate
// transpose — confirm.
FLA_Error FLA_Trsm_rlh_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_rlh_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_rlh_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_rlh_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_rlh_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_rlh_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_rlh_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_rlh_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trsm_rlh.h
// begin FLA_Trsm_rln.h


// skipped #include "FLAME.h" 

// Algorithmic variants 1-4 of FLA_Trsm_rln. The blocked (blk) forms take an
// extra fla_trsm_t* control argument; the unblocked (unb) forms do not.
// All take (diagA, alpha, A, B) and return an FLA_Error status.
// NOTE(review): "rln" presumably means side=right, uplo=lower, trans=none — confirm.
FLA_Error FLA_Trsm_rln_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_rln_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_rln_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_rln_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_rln_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_rln_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_rln_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_rln_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trsm_rln.h
// begin FLA_Trsm_rlt.h


// skipped #include "FLAME.h" 

// Algorithmic variants 1-4 of FLA_Trsm_rlt. The blocked (blk) forms take an
// extra fla_trsm_t* control argument; the unblocked (unb) forms do not.
// All take (diagA, alpha, A, B) and return an FLA_Error status.
// NOTE(review): "rlt" presumably means side=right, uplo=lower, trans=transpose — confirm.
FLA_Error FLA_Trsm_rlt_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_rlt_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_rlt_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_rlt_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_rlt_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_rlt_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_rlt_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_rlt_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trsm_rlt.h
// begin FLA_Trsm_ruc.h


// skipped #include "FLAME.h" 

// Algorithmic variants 1-4 of FLA_Trsm_ruc. The blocked (blk) forms take an
// extra fla_trsm_t* control argument; the unblocked (unb) forms do not.
// All take (diagA, alpha, A, B) and return an FLA_Error status.
// NOTE(review): "ruc" presumably means side=right, uplo=upper, trans=conjugate
// (no transpose) — confirm.
FLA_Error FLA_Trsm_ruc_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_ruc_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_ruc_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_ruc_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_ruc_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_ruc_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_ruc_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_ruc_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trsm_ruc.h
// begin FLA_Trsm_ruh.h


// skipped #include "FLAME.h" 

// Algorithmic variants 1-4 of FLA_Trsm_ruh. The blocked (blk) forms take an
// extra fla_trsm_t* control argument; the unblocked (unb) forms do not.
// All take (diagA, alpha, A, B) and return an FLA_Error status.
// NOTE(review): "ruh" presumably means side=right, uplo=upper, trans=conjugate
// transpose — confirm.
FLA_Error FLA_Trsm_ruh_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_ruh_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_ruh_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_ruh_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_ruh_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_ruh_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_ruh_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_ruh_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trsm_ruh.h
// begin FLA_Trsm_run.h


// skipped #include "FLAME.h" 

// Algorithmic variants 1-4 of FLA_Trsm_run. The blocked (blk) forms take an
// extra fla_trsm_t* control argument; the unblocked (unb) forms do not.
// All take (diagA, alpha, A, B) and return an FLA_Error status.
// NOTE(review): "run" presumably means side=right, uplo=upper, trans=none — confirm.
FLA_Error FLA_Trsm_run_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_run_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_run_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_run_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_run_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_run_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_run_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_run_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trsm_run.h
// begin FLA_Trsm_rut.h


// skipped #include "FLAME.h" 

// Algorithmic variants 1-4 of FLA_Trsm_rut. The blocked (blk) forms take an
// extra fla_trsm_t* control argument; the unblocked (unb) forms do not.
// All take (diagA, alpha, A, B) and return an FLA_Error status.
// NOTE(review): "rut" presumably means side=right, uplo=upper, trans=transpose — confirm.
FLA_Error FLA_Trsm_rut_blk_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_rut_blk_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_rut_blk_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_rut_blk_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_rut_unb_var1( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_rut_unb_var2( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_rut_unb_var3( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLA_Trsm_rut_unb_var4( FLA_Diag diagA, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLA_Trsm_rut.h

// General Trsm entry point taking the full parameter set: side, uplo, transa
// and diag selectors, the scalar alpha, operands A and B, and a fla_trsm_t*
// control argument. Returns an FLA_Error status.
FLA_Error FLA_Trsm_internal( FLA_Side side, FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );

// Sixteen per-case front ends. Each takes only diag (no side/uplo/transa)
// plus alpha, A, B and the control argument.
// NOTE(review): the three-letter suffix presumably fixes side (l/r),
// uplo (l/u) and trans (c/h/n/t), and FLA_Trsm_internal presumably dispatches
// to these — confirm against the implementation.
FLA_Error FLA_Trsm_llc( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_llh( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_lln( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_llt( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_luc( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_luh( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_lun( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_lut( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_rlc( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_rlh( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_rln( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_rlt( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_ruc( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_ruh( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_run( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );
FLA_Error FLA_Trsm_rut( FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_trsm_t* cntl );

// end FLA_Trsm.h

// end FLA_blas_var_prototypes.h
// begin FLA_lapack_var_prototypes.h


// Factorizations
// begin FLA_Chol.h


// begin FLA_Chol_l.h


// skipped #include "FLAME.h" 

// Variants 1-3 of FLA_Chol_l, each operating on a single matrix A.
//   blk : FLA_Obj interface plus a fla_chol_t* control argument.
//   unb : FLA_Obj interface, no control argument.
//   opt : FLA_Obj interface, no control argument.
//   ops/opd/opc/opz : raw-buffer interface for float, double, scomplex and
//                     dcomplex data, described by (mn_A, A, rs_A, cs_A).
// NOTE(review): mn_A is presumably the order of the square matrix and
// rs_A/cs_A its row/column strides; "_l" presumably selects the lower
// triangle — confirm against the implementation.
FLA_Error FLA_Chol_l_blk_var1( FLA_Obj A, fla_chol_t* cntl );
FLA_Error FLA_Chol_l_blk_var2( FLA_Obj A, fla_chol_t* cntl );
FLA_Error FLA_Chol_l_blk_var3( FLA_Obj A, fla_chol_t* cntl );

FLA_Error FLA_Chol_l_unb_var1( FLA_Obj A );
FLA_Error FLA_Chol_l_unb_var2( FLA_Obj A );
FLA_Error FLA_Chol_l_unb_var3( FLA_Obj A );

// Variant 1: object front end followed by its four typed raw-buffer forms.
FLA_Error FLA_Chol_l_opt_var1( FLA_Obj A );
FLA_Error FLA_Chol_l_ops_var1( int mn_A,
                               float*    A, int rs_A, int cs_A );
FLA_Error FLA_Chol_l_opd_var1( int mn_A,
                               double*   A, int rs_A, int cs_A );
FLA_Error FLA_Chol_l_opc_var1( int mn_A,
                               scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_Chol_l_opz_var1( int mn_A,
                               dcomplex* A, int rs_A, int cs_A );

// Variant 2: same layout as variant 1.
FLA_Error FLA_Chol_l_opt_var2( FLA_Obj A );
FLA_Error FLA_Chol_l_ops_var2( int mn_A,
                               float*    A, int rs_A, int cs_A );
FLA_Error FLA_Chol_l_opd_var2( int mn_A,
                               double*   A, int rs_A, int cs_A );
FLA_Error FLA_Chol_l_opc_var2( int mn_A,
                               scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_Chol_l_opz_var2( int mn_A,
                               dcomplex* A, int rs_A, int cs_A );

// Variant 3: same layout as variant 1.
FLA_Error FLA_Chol_l_opt_var3( FLA_Obj A );
FLA_Error FLA_Chol_l_ops_var3( int mn_A,
                               float*    A, int rs_A, int cs_A );
FLA_Error FLA_Chol_l_opd_var3( int mn_A,
                               double*   A, int rs_A, int cs_A );
FLA_Error FLA_Chol_l_opc_var3( int mn_A,
                               scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_Chol_l_opz_var3( int mn_A,
                               dcomplex* A, int rs_A, int cs_A );

// end FLA_Chol_l.h
// begin FLA_Chol_u.h


// skipped #include "FLAME.h" 

// Variants 1-3 of FLA_Chol_u, each operating on a single matrix A.
//   blk : FLA_Obj interface plus a fla_chol_t* control argument.
//   unb : FLA_Obj interface, no control argument.
//   opt : FLA_Obj interface, no control argument.
//   ops/opd/opc/opz : raw-buffer interface for float, double, scomplex and
//                     dcomplex data, described by (mn_A, A, rs_A, cs_A).
// NOTE(review): mn_A is presumably the order of the square matrix and
// rs_A/cs_A its row/column strides; "_u" presumably selects the upper
// triangle — confirm against the implementation.
FLA_Error FLA_Chol_u_blk_var1( FLA_Obj A, fla_chol_t* cntl );
FLA_Error FLA_Chol_u_blk_var2( FLA_Obj A, fla_chol_t* cntl );
FLA_Error FLA_Chol_u_blk_var3( FLA_Obj A, fla_chol_t* cntl );

FLA_Error FLA_Chol_u_unb_var1( FLA_Obj A );
FLA_Error FLA_Chol_u_unb_var2( FLA_Obj A );
FLA_Error FLA_Chol_u_unb_var3( FLA_Obj A );

// Variant 1: object front end followed by its four typed raw-buffer forms.
FLA_Error FLA_Chol_u_opt_var1( FLA_Obj A );
FLA_Error FLA_Chol_u_ops_var1( int mn_A,
                               float*    A, int rs_A, int cs_A );
FLA_Error FLA_Chol_u_opd_var1( int mn_A,
                               double*   A, int rs_A, int cs_A );
FLA_Error FLA_Chol_u_opc_var1( int mn_A,
                               scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_Chol_u_opz_var1( int mn_A,
                               dcomplex* A, int rs_A, int cs_A );

// Variant 2: same layout as variant 1.
FLA_Error FLA_Chol_u_opt_var2( FLA_Obj A );
FLA_Error FLA_Chol_u_ops_var2( int mn_A,
                               float*    A, int rs_A, int cs_A );
FLA_Error FLA_Chol_u_opd_var2( int mn_A,
                               double*   A, int rs_A, int cs_A );
FLA_Error FLA_Chol_u_opc_var2( int mn_A,
                               scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_Chol_u_opz_var2( int mn_A,
                               dcomplex* A, int rs_A, int cs_A );

// Variant 3: same layout as variant 1.
FLA_Error FLA_Chol_u_opt_var3( FLA_Obj A );
FLA_Error FLA_Chol_u_ops_var3( int mn_A,
                               float*    A, int rs_A, int cs_A );
FLA_Error FLA_Chol_u_opd_var3( int mn_A,
                               double*   A, int rs_A, int cs_A );
FLA_Error FLA_Chol_u_opc_var3( int mn_A,
                               scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_Chol_u_opz_var3( int mn_A,
                               dcomplex* A, int rs_A, int cs_A );

// end FLA_Chol_u.h

// Control-driven Chol entry points: FLA_Chol_internal takes an explicit uplo
// selector, while FLA_Chol_l / FLA_Chol_u take only A and the control argument.
// NOTE(review): FLA_Chol_internal presumably dispatches on uplo to the
// _l / _u forms — confirm against the implementation.
FLA_Error FLA_Chol_internal( FLA_Uplo uplo, FLA_Obj A, fla_chol_t* cntl );
FLA_Error FLA_Chol_l( FLA_Obj A, fla_chol_t* cntl );
FLA_Error FLA_Chol_u( FLA_Obj A, fla_chol_t* cntl );

// Solve routines taking (uplo, A, B, X); the FLASH_ form has the same signature.
// NOTE(review): presumably A holds an already-computed factor, B the
// right-hand sides and X receives the solution; FLASH_ presumably targets
// hierarchical objects — confirm.
FLA_Error FLA_Chol_solve( FLA_Uplo uplo, FLA_Obj A, FLA_Obj B, FLA_Obj X );
FLA_Error FLASH_Chol_solve( FLA_Uplo uplo, FLA_Obj A, FLA_Obj B, FLA_Obj X );
// end FLA_Chol.h
// begin FLA_LU_nopiv.h


// begin FLA_LU_nopiv_vars.h


// skipped #include "FLAME.h" 

// Variants 1-5 of FLA_LU_nopiv, each operating on a single matrix A.
//   blk : FLA_Obj interface plus a fla_lu_t* control argument.
//   unb : FLA_Obj interface, no control argument.
//   opt : FLA_Obj interface, no control argument.
//   ops/opd/opc/opz : raw-buffer interface for float, double, scomplex and
//                     dcomplex data, described by (m_A, n_A, A, rs_A, cs_A).
// NOTE(review): m_A/n_A are presumably the row/column counts of A and
// rs_A/cs_A its row/column strides — confirm against the implementation.
FLA_Error FLA_LU_nopiv_blk_var1( FLA_Obj A, fla_lu_t* cntl );
FLA_Error FLA_LU_nopiv_blk_var2( FLA_Obj A, fla_lu_t* cntl );
FLA_Error FLA_LU_nopiv_blk_var3( FLA_Obj A, fla_lu_t* cntl );
FLA_Error FLA_LU_nopiv_blk_var4( FLA_Obj A, fla_lu_t* cntl );
FLA_Error FLA_LU_nopiv_blk_var5( FLA_Obj A, fla_lu_t* cntl );

FLA_Error FLA_LU_nopiv_unb_var1( FLA_Obj A );
FLA_Error FLA_LU_nopiv_unb_var2( FLA_Obj A );
FLA_Error FLA_LU_nopiv_unb_var3( FLA_Obj A );
FLA_Error FLA_LU_nopiv_unb_var4( FLA_Obj A );
FLA_Error FLA_LU_nopiv_unb_var5( FLA_Obj A );

// Variant 1: object front end followed by its four typed raw-buffer forms.
FLA_Error FLA_LU_nopiv_opt_var1( FLA_Obj A );
FLA_Error FLA_LU_nopiv_ops_var1( int m_A,
                                 int n_A,
                                 float* A, int rs_A, int cs_A );
FLA_Error FLA_LU_nopiv_opd_var1( int m_A,
                                 int n_A,
                                 double* A, int rs_A, int cs_A );
FLA_Error FLA_LU_nopiv_opc_var1( int m_A,
                                 int n_A,
                                 scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_LU_nopiv_opz_var1( int m_A,
                                 int n_A,
                                 dcomplex* A, int rs_A, int cs_A );

// Variant 2: same layout as variant 1.
FLA_Error FLA_LU_nopiv_opt_var2( FLA_Obj A );
FLA_Error FLA_LU_nopiv_ops_var2( int m_A,
                                 int n_A,
                                 float* A, int rs_A, int cs_A );
FLA_Error FLA_LU_nopiv_opd_var2( int m_A,
                                 int n_A,
                                 double* A, int rs_A, int cs_A );
FLA_Error FLA_LU_nopiv_opc_var2( int m_A,
                                 int n_A,
                                 scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_LU_nopiv_opz_var2( int m_A,
                                 int n_A,
                                 dcomplex* A, int rs_A, int cs_A );

// Variant 3: same layout as variant 1.
FLA_Error FLA_LU_nopiv_opt_var3( FLA_Obj A );
FLA_Error FLA_LU_nopiv_ops_var3( int m_A,
                                 int n_A,
                                 float* A, int rs_A, int cs_A );
FLA_Error FLA_LU_nopiv_opd_var3( int m_A,
                                 int n_A,
                                 double* A, int rs_A, int cs_A );
FLA_Error FLA_LU_nopiv_opc_var3( int m_A,
                                 int n_A,
                                 scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_LU_nopiv_opz_var3( int m_A,
                                 int n_A,
                                 dcomplex* A, int rs_A, int cs_A );

// Variant 4: same layout as variant 1.
FLA_Error FLA_LU_nopiv_opt_var4( FLA_Obj A );
FLA_Error FLA_LU_nopiv_ops_var4( int m_A,
                                 int n_A,
                                 float* A, int rs_A, int cs_A );
FLA_Error FLA_LU_nopiv_opd_var4( int m_A,
                                 int n_A,
                                 double* A, int rs_A, int cs_A );
FLA_Error FLA_LU_nopiv_opc_var4( int m_A,
                                 int n_A,
                                 scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_LU_nopiv_opz_var4( int m_A,
                                 int n_A,
                                 dcomplex* A, int rs_A, int cs_A );

// Variant 5: same layout as variant 1.
FLA_Error FLA_LU_nopiv_opt_var5( FLA_Obj A );
FLA_Error FLA_LU_nopiv_ops_var5( int m_A,
                                 int n_A,
                                 float* A, int rs_A, int cs_A );
FLA_Error FLA_LU_nopiv_opd_var5( int m_A,
                                 int n_A,
                                 double* A, int rs_A, int cs_A );
FLA_Error FLA_LU_nopiv_opc_var5( int m_A,
                                 int n_A,
                                 scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_LU_nopiv_opz_var5( int m_A,
                                 int n_A,
                                 dcomplex* A, int rs_A, int cs_A );

// end FLA_LU_nopiv_vars.h

// Control-driven entry point for FLA_LU_nopiv: takes A and a fla_lu_t*
// control argument.
FLA_Error FLA_LU_nopiv_internal( FLA_Obj A, fla_lu_t* cntl );

// Solve routines taking (A, B, X); the FLASH_ form has the same signature.
// NOTE(review): presumably A holds an already-computed factorization, B the
// right-hand sides and X receives the solution — confirm.
FLA_Error FLA_LU_nopiv_solve( FLA_Obj A, FLA_Obj B, FLA_Obj X );
FLA_Error FLASH_LU_nopiv_solve( FLA_Obj A, FLA_Obj B, FLA_Obj X );
// end FLA_LU_nopiv.h
// begin FLA_LU_piv.h


// begin FLA_LU_piv_vars.h


// skipped #include "FLAME.h" 

// Variants of FLA_LU_piv operating on a matrix A and a second operand p.
// Only variants 3, 4 and 5 are declared here, plus an extra unblocked "var3b".
//   blk : FLA_Obj interface plus a fla_lu_t* control argument.
//   unb : FLA_Obj interface, no control argument.
//   opt : FLA_Obj interface, no control argument.
//   ops/opd/opc/opz : raw-buffer interface for float, double, scomplex and
//                     dcomplex matrices (m_A, n_A, buff_A, rs_A, cs_A); p is
//                     always an int buffer (buff_p) with increment inc_p.
// NOTE(review): p presumably receives the pivot indices — confirm against
// the implementation.
FLA_Error FLA_LU_piv_blk_var3( FLA_Obj A, FLA_Obj p, fla_lu_t* cntl );
FLA_Error FLA_LU_piv_blk_var4( FLA_Obj A, FLA_Obj p, fla_lu_t* cntl );
FLA_Error FLA_LU_piv_blk_var5( FLA_Obj A, FLA_Obj p, fla_lu_t* cntl );

FLA_Error FLA_LU_piv_unb_var3( FLA_Obj A, FLA_Obj p );
FLA_Error FLA_LU_piv_unb_var3b( FLA_Obj A, FLA_Obj p );
FLA_Error FLA_LU_piv_unb_var4( FLA_Obj A, FLA_Obj p );
FLA_Error FLA_LU_piv_unb_var5( FLA_Obj A, FLA_Obj p );

// Variant 3: object front end followed by its four typed raw-buffer forms.
FLA_Error FLA_LU_piv_opt_var3( FLA_Obj A, FLA_Obj p );
FLA_Error FLA_LU_piv_ops_var3( int m_A,
                               int n_A,
                               float*    buff_A, int rs_A, int cs_A,
                               int*      buff_p, int inc_p );
FLA_Error FLA_LU_piv_opd_var3( int m_A,
                               int n_A,
                               double*   buff_A, int rs_A, int cs_A,
                               int*      buff_p, int inc_p );
FLA_Error FLA_LU_piv_opc_var3( int m_A,
                               int n_A,
                               scomplex* buff_A, int rs_A, int cs_A,
                               int*      buff_p, int inc_p );
FLA_Error FLA_LU_piv_opz_var3( int m_A,
                               int n_A,
                               dcomplex* buff_A, int rs_A, int cs_A,
                               int*      buff_p, int inc_p );

// Variant 4: same layout as variant 3.
FLA_Error FLA_LU_piv_opt_var4( FLA_Obj A, FLA_Obj p );
FLA_Error FLA_LU_piv_ops_var4( int m_A,
                               int n_A,
                               float*    buff_A, int rs_A, int cs_A,
                               int*      buff_p, int inc_p );
FLA_Error FLA_LU_piv_opd_var4( int m_A,
                               int n_A,
                               double*   buff_A, int rs_A, int cs_A,
                               int*      buff_p, int inc_p );
FLA_Error FLA_LU_piv_opc_var4( int m_A,
                               int n_A,
                               scomplex* buff_A, int rs_A, int cs_A,
                               int*      buff_p, int inc_p );
FLA_Error FLA_LU_piv_opz_var4( int m_A,
                               int n_A,
                               dcomplex* buff_A, int rs_A, int cs_A,
                               int*      buff_p, int inc_p );

// Variant 5: same layout as variant 3.
FLA_Error FLA_LU_piv_opt_var5( FLA_Obj A, FLA_Obj p );
FLA_Error FLA_LU_piv_ops_var5( int m_A,
                               int n_A,
                               float*    buff_A, int rs_A, int cs_A,
                               int*      buff_p, int inc_p );
FLA_Error FLA_LU_piv_opd_var5( int m_A,
                               int n_A,
                               double*   buff_A, int rs_A, int cs_A,
                               int*      buff_p, int inc_p );
FLA_Error FLA_LU_piv_opc_var5( int m_A,
                               int n_A,
                               scomplex* buff_A, int rs_A, int cs_A,
                               int*      buff_p, int inc_p );
FLA_Error FLA_LU_piv_opz_var5( int m_A,
                               int n_A,
                               dcomplex* buff_A, int rs_A, int cs_A,
                               int*      buff_p, int inc_p );
// end FLA_LU_piv_vars.h

// Control-driven entry point for FLA_LU_piv: takes A, p and a fla_lu_t*
// control argument.
FLA_Error FLA_LU_piv_internal( FLA_Obj A, FLA_Obj p, fla_lu_t* cntl );

// Solve routines taking (A, p, B, X); the FLASH_ form has the same signature.
// NOTE(review): presumably A and p hold an already-computed factorization
// and its pivots, B the right-hand sides and X receives the solution — confirm.
FLA_Error FLA_LU_piv_solve( FLA_Obj A, FLA_Obj p, FLA_Obj B, FLA_Obj X );
FLA_Error FLASH_LU_piv_solve( FLA_Obj A, FLA_Obj p, FLA_Obj B, FLA_Obj X );
// end FLA_LU_piv.h
// begin FLA_LU_incpiv.h


// begin FLA_LU_incpiv_aux.h


// skipped #include "FLAME.h" 

// Auxiliary routines for the incremental-pivoting LU family (FLA_SA_* on flat
// operands, FLASH_* counterparts below). The blk forms and the FLASH_ routines
// take an algorithmic block size nb_alg; FLA_SA_LU_unb does not.
// NOTE(review): the multi-line argument layout (U above D, C above E) appears
// to mirror how the operands are stacked — confirm against the implementation.
FLA_Error FLA_SA_Apply_pivots( FLA_Obj C, FLA_Obj E, FLA_Obj p );
FLA_Error FLA_SA_LU_blk( FLA_Obj U,
                         FLA_Obj D, FLA_Obj p, FLA_Obj L, dim_t nb_alg );
FLA_Error FLA_SA_LU_unb( FLA_Obj U, 
                         FLA_Obj D, FLA_Obj p, FLA_Obj L );
FLA_Error FLA_SA_FS_blk( FLA_Obj L,
                         FLA_Obj D, FLA_Obj p, FLA_Obj C,
                                               FLA_Obj E, dim_t nb_alg );

// FLASH_ drivers. var1 takes (A, p, L); var2 additionally takes U. The LU
// routines take a fla_lu_t* control argument, FLASH_Trsm_piv a fla_trsm_t*,
// and FLASH_SA_FS a fla_gemm_t*.
FLA_Error FLASH_LU_incpiv_var1( FLA_Obj A, FLA_Obj p, FLA_Obj L, dim_t nb_alg, fla_lu_t* cntl );
FLA_Error FLASH_LU_incpiv_var2( FLA_Obj A, FLA_Obj p, FLA_Obj L, FLA_Obj U, dim_t nb_alg, fla_lu_t* cntl );
FLA_Error FLASH_Trsm_piv( FLA_Obj A, FLA_Obj B, FLA_Obj p, fla_trsm_t* cntl );
FLA_Error FLASH_SA_LU( FLA_Obj B, FLA_Obj C,
                       FLA_Obj D, FLA_Obj E, FLA_Obj p, FLA_Obj L, dim_t nb_alg, fla_lu_t* cntl );
FLA_Error FLASH_SA_FS( FLA_Obj L,
                       FLA_Obj D, FLA_Obj p, FLA_Obj C,
                                             FLA_Obj E, dim_t nb_alg, fla_gemm_t* cntl );

// Helpers named FS_incpiv; neither takes a control argument.
// NOTE(review): "FS" presumably stands for forward substitution — confirm.
FLA_Error FLASH_FS_incpiv_aux1( FLA_Obj A, FLA_Obj p, FLA_Obj L, FLA_Obj b, dim_t nb_alg );
FLA_Error FLASH_FS_incpiv_aux2( FLA_Obj L,
                                FLA_Obj D, FLA_Obj p, FLA_Obj C,
                                                      FLA_Obj E, dim_t nb_alg );

// end FLA_LU_incpiv_aux.h

// Setup helpers for FLASH_LU_incpiv. create_hier_matrices takes a flat matrix,
// a depth, a b_flash array and an algorithmic block size b_alg, and writes
// three objects through the A, p and L output pointers.
// determine_alg_blocksize returns a dim_t computed from A.
// NOTE(review): b_flash presumably holds one block size per hierarchy level
// (depth entries) — confirm against the implementation.
FLA_Error FLASH_LU_incpiv_create_hier_matrices( FLA_Obj A_flat, dim_t depth, dim_t* b_flash, dim_t b_alg, FLA_Obj* A, FLA_Obj* p, FLA_Obj* L );
dim_t     FLASH_LU_incpiv_determine_alg_blocksize( FLA_Obj A );

// Two factorization front ends with identical (A, p, L) signatures.
// NOTE(review): "noopt"/"opt1" presumably select an optimization level — confirm.
FLA_Error FLASH_LU_incpiv_noopt( FLA_Obj A, FLA_Obj p, FLA_Obj L );
FLA_Error FLASH_LU_incpiv_opt1( FLA_Obj A, FLA_Obj p, FLA_Obj L );

// Solve routine taking the (A, p, L) triple plus B and X.
FLA_Error FLASH_LU_incpiv_solve( FLA_Obj A, FLA_Obj p, FLA_Obj L, FLA_Obj B, FLA_Obj X );
// end FLA_LU_incpiv.h
// begin FLA_QR_UT.h


// begin FLA_QR_UT_vars.h


// Variants of FLA_QR_UT.
// Variant 1: the unb/opt forms take a second operand named t, and the typed
// raw-buffer forms describe it with a single increment (inc_t). The blk form
// takes T and a fla_qrut_t* control argument.
// NOTE(review): t is presumably a vector and T a matrix — confirm.
FLA_Error FLA_QR_UT_unb_var1( FLA_Obj A, FLA_Obj t );
FLA_Error FLA_QR_UT_blk_var1( FLA_Obj A, FLA_Obj T, fla_qrut_t* cntl );
FLA_Error FLA_QR_UT_opt_var1( FLA_Obj A, FLA_Obj t );
FLA_Error FLA_QR_UT_ops_var1( int m_A,
                              int n_A,
                              float* A, int rs_A, int cs_A,
                              float* t, int inc_t );
FLA_Error FLA_QR_UT_opd_var1( int m_A,
                              int n_A,
                              double* A, int rs_A, int cs_A,
                              double* t, int inc_t );
FLA_Error FLA_QR_UT_opc_var1( int m_A,
                              int n_A,
                              scomplex* A, int rs_A, int cs_A,
                              scomplex* t, int inc_t );
FLA_Error FLA_QR_UT_opz_var1( int m_A,
                              int n_A,
                              dcomplex* A, int rs_A, int cs_A,
                              dcomplex* t, int inc_t );

// Variant 2: every form takes T; the typed raw-buffer forms describe it with
// two strides (rs_T, cs_T) instead of a single increment.
FLA_Error FLA_QR_UT_unb_var2( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_QR_UT_blk_var2( FLA_Obj A, FLA_Obj T, fla_qrut_t* cntl );
FLA_Error FLA_QR_UT_opt_var2( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_QR_UT_ops_var2( int m_A,
                              int n_A,
                              float* A, int rs_A, int cs_A,
                              float* T, int rs_T, int cs_T );
FLA_Error FLA_QR_UT_opd_var2( int m_A,
                              int n_A,
                              double* A, int rs_A, int cs_A,
                              double* T, int rs_T, int cs_T );
FLA_Error FLA_QR_UT_opc_var2( int m_A,
                              int n_A,
                              scomplex* A, int rs_A, int cs_A,
                              scomplex* T, int rs_T, int cs_T );
FLA_Error FLA_QR_UT_opz_var2( int m_A,
                              int n_A,
                              dcomplex* A, int rs_A, int cs_A,
                              dcomplex* T, int rs_T, int cs_T );

// Variant 3 is declared in blocked form only.
FLA_Error FLA_QR_UT_blk_var3( FLA_Obj A, FLA_Obj T, fla_qrut_t* cntl );

// end FLA_QR_UT_vars.h

// Front end for FLA_QR_UT taking (A, T) with no control argument.
FLA_Error FLA_QR_UT( FLA_Obj A, FLA_Obj T );

// Control-driven entry points; the copy form takes an additional operand U.
FLA_Error FLA_QR_UT_internal( FLA_Obj A, FLA_Obj T, fla_qrut_t* cntl );
FLA_Error FLA_QR_UT_copy_internal( FLA_Obj A, FLA_Obj T, FLA_Obj U, fla_qrut_t* cntl );

// Writes an object through the T output pointer, given A.
// NOTE(review): presumably allocates T sized to match A; ownership of the
// created object is not visible here — confirm.
FLA_Error FLA_QR_UT_create_T( FLA_Obj A, FLA_Obj* T );

// Takes T and tau.
// NOTE(review): presumably extracts the tau values from T into tau — confirm.
FLA_Error FLA_QR_UT_recover_tau( FLA_Obj T, FLA_Obj tau );

// Solve routine taking (A, T, B, X).
FLA_Error FLA_QR_UT_solve( FLA_Obj A, FLA_Obj T, FLA_Obj B, FLA_Obj X );

// FLASH_ counterparts. FLASH_QR_UT and create_hier_matrices name their second
// operand TW, while FLASH_QR_UT_solve names it T.
FLA_Error FLASH_QR_UT( FLA_Obj A, FLA_Obj TW );
FLA_Error FLASH_QR_UT_create_hier_matrices( FLA_Obj A_flat, dim_t depth, dim_t* b_flash, FLA_Obj* A, FLA_Obj* TW );
FLA_Error FLASH_QR_UT_solve( FLA_Obj A, FLA_Obj T, FLA_Obj B, FLA_Obj X );


// form_Q family: the front end takes (A, T, Q), the blocked variant (A, T, W),
// and the opt variant only (A, T). The typed raw-buffer forms take
// (m_A, n_AT) plus A and T buffers with row/column strides.
// NOTE(review): the opt and raw-buffer forms have no separate output operand,
// so they presumably overwrite A — confirm against the implementation.
FLA_Error FLA_QR_UT_form_Q( FLA_Obj A, FLA_Obj T, FLA_Obj Q );
FLA_Error FLA_QR_UT_form_Q_blk_var1( FLA_Obj A, FLA_Obj T, FLA_Obj W );
FLA_Error FLA_QR_UT_form_Q_opt_var1( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_QR_UT_form_Q_ops_var1( int       m_A,
                                     int       n_AT,
                                     float*    buff_A, int rs_A, int cs_A,
                                     float*    buff_T, int rs_T, int cs_T );
FLA_Error FLA_QR_UT_form_Q_opd_var1( int       m_A,
                                     int       n_AT,
                                     double*   buff_A, int rs_A, int cs_A,
                                     double*   buff_T, int rs_T, int cs_T );
FLA_Error FLA_QR_UT_form_Q_opc_var1( int       m_A,
                                     int       n_AT,
                                     scomplex* buff_A, int rs_A, int cs_A,
                                     scomplex* buff_T, int rs_T, int cs_T );
FLA_Error FLA_QR_UT_form_Q_opz_var1( int       m_A,
                                     int       n_AT,
                                     dcomplex* buff_A, int rs_A, int cs_A,
                                     dcomplex* buff_T, int rs_T, int cs_T );
// end FLA_QR_UT.h
// begin FLA_QR_UT_piv.h


// begin FLA_QR_UT_piv_vars.h



// BLAS 2 version
// Variants of FLA_QR_UT_piv. Every form takes (A, T, w, p); the blocked forms
// add a fla_qrut_t* control argument.

// BLAS 2 version
FLA_Error FLA_QR_UT_piv_unb_var1( FLA_Obj A, FLA_Obj T, FLA_Obj w, FLA_Obj p );
FLA_Error FLA_QR_UT_piv_blk_var1( FLA_Obj A, FLA_Obj T, FLA_Obj w, FLA_Obj p, fla_qrut_t* cntl );

// BLAS 3 version
FLA_Error FLA_QR_UT_piv_unb_var2( FLA_Obj A, FLA_Obj T, FLA_Obj w, FLA_Obj p );
FLA_Error FLA_QR_UT_piv_blk_var2( FLA_Obj A, FLA_Obj T, FLA_Obj w, FLA_Obj p, fla_qrut_t* cntl );
// Helper taking nine FLA_Obj operands and no control argument.
// NOTE(review): declared alongside the BLAS 3 variant, so presumably used
// by it — confirm.
FLA_Error FLA_Apply_H2_UT_piv_row( FLA_Obj tau, FLA_Obj a1t, FLA_Obj u1t, FLA_Obj W,
                                   FLA_Obj u2,  FLA_Obj A2,  FLA_Obj U2,  FLA_Obj w1t,
                                   FLA_Obj vt );

// end FLA_QR_UT_piv_vars.h

// Front end for FLA_QR_UT_piv taking (A, T, w, p) with no control argument.
FLA_Error FLA_QR_UT_piv( FLA_Obj A, FLA_Obj T, FLA_Obj w, FLA_Obj p );

// Control-driven entry point, and a column-norm helper taking (alpha, A, b).
FLA_Error FLA_QR_UT_piv_internal( FLA_Obj A, FLA_Obj T, FLA_Obj w, FLA_Obj p, fla_qrut_t* cntl );
FLA_Error FLA_QR_UT_piv_colnorm( FLA_Obj alpha, FLA_Obj A, FLA_Obj b );

// The source files are located at src/base/flamec/check/lapack
// Each _check routine has the same parameter list as the routine it is named after.
FLA_Error FLA_QR_UT_piv_check( FLA_Obj A, FLA_Obj T, FLA_Obj w, FLA_Obj p );
FLA_Error FLA_QR_UT_piv_internal_check( FLA_Obj A, FLA_Obj T, FLA_Obj w, FLA_Obj p, fla_qrut_t* cntl );
FLA_Error FLA_QR_UT_piv_colnorm_check( FLA_Obj alpha, FLA_Obj A, FLA_Obj b );
// end FLA_QR_UT_piv.h
// begin FLA_QR2_UT.h


// begin FLA_QR2_UT_vars.h


// Variants of FLA_QR2_UT, operating on three operands U, D and T.
// Two blocked variants take a fla_qr2ut_t* control argument; only variant 1
// has unb/opt and typed raw-buffer forms.
// NOTE(review): the layout with U on the line above D appears to mirror the
// operands being stacked — confirm against the implementation.
FLA_Error FLA_QR2_UT_blk_var1( FLA_Obj U,
                               FLA_Obj D, FLA_Obj T, fla_qr2ut_t* cntl );
FLA_Error FLA_QR2_UT_blk_var2( FLA_Obj U,
                               FLA_Obj D, FLA_Obj T, fla_qr2ut_t* cntl );

FLA_Error FLA_QR2_UT_unb_var1( FLA_Obj U,
                               FLA_Obj D, FLA_Obj T );

FLA_Error FLA_QR2_UT_opt_var1( FLA_Obj U,
                               FLA_Obj D, FLA_Obj T );

// Typed raw-buffer forms (float, double, scomplex, dcomplex). Each takes two
// sizes (m_UT, m_D) and a buffer with row/column strides for U, D and T.
// NOTE(review): m_UT is presumably a dimension shared by U and T — confirm.
FLA_Error FLA_QR2_UT_ops_var1( int m_UT,
                               int m_D,
                               float* U, int rs_U, int cs_U,
                               float* D, int rs_D, int cs_D,
                               float* T, int rs_T, int cs_T );
FLA_Error FLA_QR2_UT_opd_var1( int m_UT,
                               int m_D,
                               double* U, int rs_U, int cs_U,
                               double* D, int rs_D, int cs_D,
                               double* T, int rs_T, int cs_T );
FLA_Error FLA_QR2_UT_opc_var1( int m_UT,
                               int m_D,
                               scomplex* U, int rs_U, int cs_U,
                               scomplex* D, int rs_D, int cs_D,
                               scomplex* T, int rs_T, int cs_T );
FLA_Error FLA_QR2_UT_opz_var1( int m_UT,
                               int m_D,
                               dcomplex* U, int rs_U, int cs_U,
                               dcomplex* D, int rs_D, int cs_D,
                               dcomplex* T, int rs_T, int cs_T );
// end FLA_QR2_UT_vars.h

// FLASH_ front end for QR2_UT taking (U, D, T) with no control argument.
FLA_Error FLASH_QR2_UT( FLA_Obj U,
                        FLA_Obj D, FLA_Obj T );

// Control-driven entry point taking (U, D, T) and a fla_qr2ut_t* control argument.
FLA_Error FLA_QR2_UT_internal( FLA_Obj U,
                               FLA_Obj D, FLA_Obj T, fla_qr2ut_t* cntl );

// end FLA_QR2_UT.h
// begin FLA_QR_UT_inc.h


// QR_UT_inc interface. All routines operate on FLA_Obj operands; every routine
// except the blocksize query returns an FLA_Error status.

// Top-level entry point taking the matrix A and the companion object TW.
FLA_Error FLASH_QR_UT_inc( FLA_Obj A, FLA_Obj TW );

// Two alternative entry points with the same operands as FLASH_QR_UT_inc.
FLA_Error FLASH_QR_UT_inc_noopt( FLA_Obj A, FLA_Obj TW );
FLA_Error FLASH_QR_UT_inc_opt1( FLA_Obj A, FLA_Obj TW );

// Control-driven variants; var2 takes an extra FLA_Obj operand U.
FLA_Error FLA_QR_UT_inc_blk_var1( FLA_Obj A, FLA_Obj TW, fla_qrutinc_t* cntl );
FLA_Error FLA_QR_UT_inc_blk_var2( FLA_Obj A, FLA_Obj TW, FLA_Obj U, fla_qrutinc_t* cntl );

// Helper that writes the objects A and TW through output pointers, given a
// flat input object, a depth, a b_flash array and an algorithmic blocksize.
FLA_Error FLASH_QR_UT_inc_create_hier_matrices( FLA_Obj A_flat, dim_t depth, dim_t* b_flash, dim_t b_alg, FLA_Obj* A, FLA_Obj* TW );
// Returns a dim_t (not an FLA_Error) derived from A.
dim_t     FLASH_QR_UT_inc_determine_alg_blocksize( FLA_Obj A );

// Solve-style routine taking A, TW, a right-hand-side object B and X.
FLA_Error FLASH_QR_UT_inc_solve( FLA_Obj A, FLA_Obj TW, FLA_Obj B, FLA_Obj X );

// end FLA_QR_UT_inc.h
// begin FLA_LQ_UT.h


// begin FLA_LQ_UT_vars.h


// skipped #include "FLAME.h" 

// LQ_UT variant 1. The unb/opt object forms take A and a lower-case t operand,
// while the blk form takes an upper-case T operand and a fla_lqut_t control
// pointer. All return an FLA_Error status.
FLA_Error FLA_LQ_UT_unb_var1( FLA_Obj A, FLA_Obj t );
FLA_Error FLA_LQ_UT_blk_var1( FLA_Obj A, FLA_Obj T, fla_lqut_t* cntl );
FLA_Error FLA_LQ_UT_opt_var1( FLA_Obj A, FLA_Obj t );
// Typed raw-buffer forms (ops: float, opd: double, opc: scomplex,
// opz: dcomplex). A carries an (rs_A, cs_A) pair; t carries a single
// increment inc_t, i.e. it is addressed as a vector.
FLA_Error FLA_LQ_UT_ops_var1( int m_A,
                              int n_A,
                              float* A, int rs_A, int cs_A,
                              float* t, int inc_t );
FLA_Error FLA_LQ_UT_opd_var1( int m_A,
                              int n_A,
                              double* A, int rs_A, int cs_A,
                              double* t, int inc_t );
FLA_Error FLA_LQ_UT_opc_var1( int m_A,
                              int n_A,
                              scomplex* A, int rs_A, int cs_A,
                              scomplex* t, int inc_t );
FLA_Error FLA_LQ_UT_opz_var1( int m_A,
                              int n_A,
                              dcomplex* A, int rs_A, int cs_A,
                              dcomplex* t, int inc_t );

// LQ_UT variant 2. Unlike variant 1, every form takes an upper-case T operand,
// and the typed forms pass T with an (rs_T, cs_T) pair rather than a single
// increment. All return an FLA_Error status.
FLA_Error FLA_LQ_UT_unb_var2( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_LQ_UT_blk_var2( FLA_Obj A, FLA_Obj T, fla_lqut_t* cntl );
FLA_Error FLA_LQ_UT_opt_var2( FLA_Obj A, FLA_Obj T );
// Typed raw-buffer forms (ops: float, opd: double, opc: scomplex,
// opz: dcomplex).
FLA_Error FLA_LQ_UT_ops_var2( int m_A,
                              int n_A,
                              float* A, int rs_A, int cs_A,
                              float* T, int rs_T, int cs_T );
FLA_Error FLA_LQ_UT_opd_var2( int m_A,
                              int n_A,
                              double* A, int rs_A, int cs_A,
                              double* T, int rs_T, int cs_T );
FLA_Error FLA_LQ_UT_opc_var2( int m_A,
                              int n_A,
                              scomplex* A, int rs_A, int cs_A,
                              scomplex* T, int rs_T, int cs_T );
FLA_Error FLA_LQ_UT_opz_var2( int m_A,
                              int n_A,
                              dcomplex* A, int rs_A, int cs_A,
                              dcomplex* T, int rs_T, int cs_T );

// Variant 3 is declared only in its control-driven blk form.
FLA_Error FLA_LQ_UT_blk_var3( FLA_Obj A, FLA_Obj T, fla_lqut_t* cntl );

// end FLA_LQ_UT_vars.h

// Front-end LQ_UT interface. Every routine returns an FLA_Error status.

// Main entry point taking A and T.
FLA_Error FLA_LQ_UT( FLA_Obj A, FLA_Obj T );

// Same operands plus a fla_lqut_t control pointer.
FLA_Error FLA_LQ_UT_internal( FLA_Obj A, FLA_Obj T, fla_lqut_t* cntl );

// Writes a T object through the output pointer, given A.
FLA_Error FLA_LQ_UT_create_T( FLA_Obj A, FLA_Obj* T );

// Takes T and a tau object.
// NOTE(review): presumably fills tau from T -- confirm against the definition.
FLA_Error FLA_LQ_UT_recover_tau( FLA_Obj T, FLA_Obj tau );

// Solve-style routine taking A, T, a right-hand-side object B and X.
FLA_Error FLA_LQ_UT_solve( FLA_Obj A, FLA_Obj T, FLA_Obj B, FLA_Obj X );

// FLASH-prefixed counterparts. Note that FLASH_LQ_UT names its second operand
// TW, whereas FLASH_LQ_UT_solve names it T.
FLA_Error FLASH_LQ_UT( FLA_Obj A, FLA_Obj TW );
FLA_Error FLASH_LQ_UT_create_hier_matrices( FLA_Obj A_flat, dim_t depth, dim_t* b_flash, FLA_Obj* A, FLA_Obj* TW );
FLA_Error FLASH_LQ_UT_solve( FLA_Obj A, FLA_Obj T, FLA_Obj B, FLA_Obj X );

// Takes A, T and a Q object. The typed/variant forms that follow in the file
// are commented out, so this is the only form_Q prototype declared here.
FLA_Error FLA_LQ_UT_form_Q( FLA_Obj A, FLA_Obj T, FLA_Obj Q );
//FLA_Error FLA_LQ_UT_form_Q_blk_var1( FLA_Obj A, FLA_Obj T, FLA_Obj W );
//FLA_Error FLA_LQ_UT_form_Q_opt_var1( FLA_Obj A, FLA_Obj T );
//FLA_Error FLA_LQ_UT_form_Q_ops_var1( int       m_A,
//                                     int       n_A,
//                                     float*    buff_A, int rs_A, int cs_A,
//                                     float*    buff_T, int rs_T, int cs_T );
//FLA_Error FLA_LQ_UT_form_Q_opd_var1( int       m_A,
//                                     int       n_A,
//                                     double*   buff_A, int rs_A, int cs_A,
//                                     double*   buff_T, int rs_T, int cs_T );
//FLA_Error FLA_LQ_UT_form_Q_opc_var1( int       m_A,
//                                     int       n_A,
//                                     scomplex* buff_A, int rs_A, int cs_A,
//                                     scomplex* buff_T, int rs_T, int cs_T );
//FLA_Error FLA_LQ_UT_form_Q_opz_var1( int       m_A,
//                                     int       n_A,
//                                     dcomplex* buff_A, int rs_A, int cs_A,
//                                     dcomplex* buff_T, int rs_T, int cs_T );
// end FLA_LQ_UT.h
// begin FLA_CAQR2_UT.h


// begin FLA_CAQR2_UT_vars.h


// CAQR2_UT variants. The prototypes mirror the FLA_QR2_UT_* family declared
// earlier in this file: same operand names (U, D, T), with the blk_* forms
// taking a fla_caqr2ut_t control pointer. All return an FLA_Error status.
FLA_Error FLA_CAQR2_UT_blk_var1( FLA_Obj U,
                                 FLA_Obj D, FLA_Obj T, fla_caqr2ut_t* cntl );
FLA_Error FLA_CAQR2_UT_blk_var2( FLA_Obj U,
                                 FLA_Obj D, FLA_Obj T, fla_caqr2ut_t* cntl );

FLA_Error FLA_CAQR2_UT_unb_var1( FLA_Obj U,
                                 FLA_Obj D, FLA_Obj T );

FLA_Error FLA_CAQR2_UT_opt_var1( FLA_Obj U,
                                 FLA_Obj D, FLA_Obj T );
// Typed raw-buffer forms (ops: float, opd: double, opc: scomplex,
// opz: dcomplex); each buffer is followed by its (rs_*, cs_*) pair.
FLA_Error FLA_CAQR2_UT_ops_var1( int m_UT,
                                 int m_D,
                                 float* U, int rs_U, int cs_U,
                                 float* D, int rs_D, int cs_D,
                                 float* T, int rs_T, int cs_T );
FLA_Error FLA_CAQR2_UT_opd_var1( int m_UT,
                                 int m_D,
                                 double* U, int rs_U, int cs_U,
                                 double* D, int rs_D, int cs_D,
                                 double* T, int rs_T, int cs_T );
FLA_Error FLA_CAQR2_UT_opc_var1( int m_UT,
                                 int m_D,
                                 scomplex* U, int rs_U, int cs_U,
                                 scomplex* D, int rs_D, int cs_D,
                                 scomplex* T, int rs_T, int cs_T );
FLA_Error FLA_CAQR2_UT_opz_var1( int m_UT,
                                 int m_D,
                                 dcomplex* U, int rs_U, int cs_U,
                                 dcomplex* D, int rs_D, int cs_D,
                                 dcomplex* T, int rs_T, int cs_T );

// end FLA_CAQR2_UT_vars.h

// "_internal" form of CAQR2_UT: takes U, D, T and a fla_caqr2ut_t control
// pointer and returns an FLA_Error status.
// NOTE(review): presumably selects a variant based on cntl -- confirm.
FLA_Error FLA_CAQR2_UT_internal( FLA_Obj U,
                                 FLA_Obj D, FLA_Obj T, fla_caqr2ut_t* cntl );

// end FLA_CAQR2_UT.h
// begin FLA_CAQR_UT_inc.h


// CAQR_UT_inc interface. Most routines take a leading dim_t argument (p or
// nb_part) followed by FLA_Obj operands.
// NOTE(review): p and nb_part presumably describe a partitioning of the
// panels -- confirm against the definitions.

// Top-level entry point taking p and the objects A, ATW, R and RTW.
FLA_Error FLASH_CAQR_UT_inc( dim_t p, FLA_Obj A, FLA_Obj ATW, FLA_Obj R, FLA_Obj RTW );

// Alternative entry point with the same parameter list.
FLA_Error FLASH_CAQR_UT_inc_noopt( dim_t p, FLA_Obj A, FLA_Obj ATW, FLA_Obj R, FLA_Obj RTW );

// Writes A, ATW, R and RTW through output pointers.
FLA_Error FLASH_CAQR_UT_inc_create_hier_matrices( dim_t p, FLA_Obj A_flat, dim_t depth, dim_t* b_flash, dim_t b_alg, FLA_Obj* A, FLA_Obj* ATW, FLA_Obj* R, FLA_Obj* RTW );
// Returns a dim_t (not an FLA_Error) derived from A.
dim_t     FLASH_CAQR_UT_inc_determine_alg_blocksize( FLA_Obj A );
FLA_Error FLASH_CAQR_UT_inc_adjust_views( FLA_Obj A, FLA_Obj TW );

// Returns nothing; R is passed by value.
void      FLA_CAQR_UT_inc_init_structure( dim_t p, dim_t nb_part, FLA_Obj R );

// Returns a dim_t computed from p and A.
dim_t     FLA_CAQR_UT_inc_compute_blocks_per_part( dim_t p, FLA_Obj A );

FLA_Error FLA_CAQR_UT_inc_factorize_panels( dim_t nb_part, FLA_Obj A, FLA_Obj ATW );

FLA_Error FLA_CAQR_UT_inc_copy_triangles( dim_t nb_part, FLA_Obj A, FLA_Obj R );

// Control-driven variant taking R, TW and a fla_caqrutinc_t control pointer.
FLA_Error FLA_CAQR_UT_inc_blk_var1( FLA_Obj R, FLA_Obj TW, fla_caqrutinc_t* cntl );

// Solve-style routine: the entry-point operands plus B and X.
FLA_Error FLASH_CAQR_UT_inc_solve( dim_t p, FLA_Obj A, FLA_Obj ATW, FLA_Obj R, FLA_Obj RTW, FLA_Obj B, FLA_Obj X );

// end FLA_CAQR_UT_inc.h


// Other Decompositions
// begin FLA_Hevd.h


// begin FLA_Hevd_ln.h


// Hevd "ln" variant: takes the objects A and l and returns an FLA_Error
// status. NOTE(review): "ln" presumably means lower-stored / eigenvalues only,
// by analogy with the jobz and uplo arguments of FLA_Hevd() -- confirm.
FLA_Error FLA_Hevd_ln_unb_var1( FLA_Obj A, FLA_Obj l );
// end FLA_Hevd_ln.h
// begin FLA_Hevd_lv.h


// Hevd "lv" variants 1 and 2. Both share one parameter list: an iteration
// bound n_iter_max, the objects A and l, and the dim_t tuning values k_accum
// and b_alg. Both return an FLA_Error status.
FLA_Error FLA_Hevd_lv_unb_var1( dim_t n_iter_max, FLA_Obj A, FLA_Obj l, dim_t k_accum, dim_t b_alg );
FLA_Error FLA_Hevd_lv_unb_var2( dim_t n_iter_max, FLA_Obj A, FLA_Obj l, dim_t k_accum, dim_t b_alg );

// end FLA_Hevd_lv.h
//#include "FLA_Hevd_un.h"
//#include "FLA_Hevd_uv.h"

// Takes a uplo selector, A and a sigma object.
// NOTE(review): presumably stores a scaling factor in sigma -- confirm.
FLA_Error FLA_Hevd_compute_scaling( FLA_Uplo uplo, FLA_Obj A, FLA_Obj sigma );

// Front-end Hevd entry point: jobz (FLA_Evd_type) and uplo (FLA_Uplo)
// selectors followed by the objects A and l. Returns an FLA_Error status.
FLA_Error FLA_Hevd( FLA_Evd_type jobz, FLA_Uplo uplo, FLA_Obj A, FLA_Obj l );

// end FLA_Hevd.h
// begin FLA_Tevd.h


// begin FLA_Tevd_n.h


// begin FLA_Tevd_iteracc_n.h


// --- FLA_Tevd_iteracc_n_opt_var1() -------------------------------------------

// Typed forms of Tevd_iteracc_n variant 1 (ops: float, opd: double).
// buff_d and buff_e are vectors addressed with the increments inc_d and
// inc_e. n_iter_perf is an int output pointer. Returns an FLA_Error status.
// Only real-typed forms are declared; there are no opc/opz counterparts here.
FLA_Error FLA_Tevd_iteracc_n_ops_var1( int       m_A,
                                       int       n_G,
                                       int       ijTL,
                                       float*    buff_d, int inc_d, 
                                       float*    buff_e, int inc_e,
                                       int*      n_iter_perf );
FLA_Error FLA_Tevd_iteracc_n_opd_var1( int       m_A,
                                       int       n_G,
                                       int       ijTL,
                                       double*   buff_d, int inc_d, 
                                       double*   buff_e, int inc_e,
                                       int*      n_iter_perf );

// end FLA_Tevd_iteracc_n.h
// begin FLA_Tevd_eigval_n.h


// --- FLA_Tevd_eigval_n_opt_var1() --------------------------------------------

// Object-based form: takes G, d, e and an n_iter object.
FLA_Error FLA_Tevd_eigval_n_opt_var1( FLA_Obj G, FLA_Obj d, FLA_Obj e, FLA_Obj n_iter );
// Typed forms (ops: float, opd: double). The typed forms take no G buffer,
// only its extent n_G; n_iter is an int pointer. Returns an FLA_Error status.
FLA_Error FLA_Tevd_eigval_n_ops_var1( int       m_A,
                                      int       n_G,
                                      float*    buff_d, int inc_d, 
                                      float*    buff_e, int inc_e,
                                      int*      n_iter );
FLA_Error FLA_Tevd_eigval_n_opd_var1( int       m_A,
                                      int       n_G,
                                      double*   buff_d, int inc_d, 
                                      double*   buff_e, int inc_e,
                                      int*      n_iter );

// end FLA_Tevd_eigval_n.h
// begin FLA_Tevd_francis_n.h


// --- FLA_Tevd_francis_n_opt_var1() -------------------------------------------

// Object-based form: takes a shift object and the d and e objects.
FLA_Error FLA_Tevd_francis_n_opt_var1( FLA_Obj shift, FLA_Obj d, FLA_Obj e );
// Typed forms (ops: float, opd: double). The shift is passed by pointer
// (buff_shift); d and e are vectors with increments. Returns an FLA_Error
// status.
FLA_Error FLA_Tevd_francis_n_ops_var1( int       m_A,
                                       float*    buff_shift,
                                       float*    buff_d, int inc_d, 
                                       float*    buff_e, int inc_e ); 
FLA_Error FLA_Tevd_francis_n_opd_var1( int       m_A,
                                       double*   buff_shift,
                                       double*   buff_d, int inc_d, 
                                       double*   buff_e, int inc_e ); 

// end FLA_Tevd_francis_n.h

// --- FLA_Tevd_find_submatrix() -----------------------------------------------

// Typed forms (ops: float, opd: double). Inputs are the extent m_A, a starting
// index ij_begin and the d/e vectors with their increments; ijTL and ijBR are
// int output pointers. Returns an FLA_Error status.
// NOTE(review): ijTL/ijBR presumably receive the top-left and bottom-right
// indices of the located submatrix -- confirm against the definition.
FLA_Error FLA_Tevd_find_submatrix_ops( int       m_A,
                                       int       ij_begin,
                                       float*    buff_d, int inc_d, 
                                       float*    buff_e, int inc_e,
                                       int*      ijTL,
                                       int*      ijBR );
FLA_Error FLA_Tevd_find_submatrix_opd( int       m_A,
                                       int       ij_begin,
                                       double*   buff_d, int inc_d, 
                                       double*   buff_e, int inc_e,
                                       int*      ijTL,
                                       int*      ijBR );

// --- FLA_Norm1_tridiag() -----------------------------------------------------

// Object-based form: takes the d and e objects and a norm object.
FLA_Error FLA_Norm1_tridiag( FLA_Obj d, FLA_Obj e, FLA_Obj norm );
// Typed forms (ops: float, opd: double). The result is returned through the
// norm pointer; the function's own return value is an FLA_Error status.
FLA_Error FLA_Norm1_tridiag_ops( int       m_A,
                                 float*    buff_d, int inc_d, 
                                 float*    buff_e, int inc_e,
                                 float*    norm );
FLA_Error FLA_Norm1_tridiag_opd( int       m_A,
                                 double*   buff_d, int inc_d, 
                                 double*   buff_e, int inc_e,
                                 double*   norm );

// --- FLA_Tevd_n_opt_var1() ---------------------------------------------------

// Object-based form: an iteration bound n_iter_max and the objects d, e, G, U.
FLA_Error FLA_Tevd_n_opt_var1( dim_t n_iter_max, FLA_Obj d, FLA_Obj e, FLA_Obj G, FLA_Obj U );
// Typed forms. In all four, d and e are real vectors and G is a complex
// buffer with an (rs_G, cs_G) pair. As a result the ops/opc prototypes have
// identical parameter lists (float, float, scomplex), and so do opd/opz
// (double, double, dcomplex).
// m_U is accepted, but no U buffer is passed to the typed forms.
FLA_Error FLA_Tevd_n_ops_var1( int       m_A,
                               int       m_U,
                               int       n_G,
                               int       n_iter_max,
                               float*    buff_d, int inc_d, 
                               float*    buff_e, int inc_e,
                               scomplex* buff_G, int rs_G, int cs_G );
FLA_Error FLA_Tevd_n_opd_var1( int       m_A,
                               int       m_U,
                               int       n_G,
                               int       n_iter_max,
                               double*   buff_d, int inc_d, 
                               double*   buff_e, int inc_e,
                               dcomplex* buff_G, int rs_G, int cs_G );
FLA_Error FLA_Tevd_n_opc_var1( int       m_A,
                               int       m_U,
                               int       n_G,
                               int       n_iter_max,
                               float*    buff_d, int inc_d, 
                               float*    buff_e, int inc_e,
                               scomplex* buff_G, int rs_G, int cs_G );
FLA_Error FLA_Tevd_n_opz_var1( int       m_A,
                               int       m_U,
                               int       n_G,
                               int       n_iter_max,
                               double*   buff_d, int inc_d, 
                               double*   buff_e, int inc_e,
                               dcomplex* buff_G, int rs_G, int cs_G );


// end FLA_Tevd_n.h
// begin FLA_Tevd_v.h


// begin FLA_Tevd_iteracc_v.h


// --- FLA_Tevd_iteracc_v_opt_var1() -------------------------------------------

// Typed forms of Tevd_iteracc_v variant 1 (ops: float d/e with scomplex G;
// opd: double d/e with dcomplex G). Compared with the iteracc_n prototypes
// earlier in this file, these additionally take the G buffer with its
// (rs_G, cs_G) pair. n_iter_perf is an int output pointer.
FLA_Error FLA_Tevd_iteracc_v_ops_var1( int       m_A,
                                       int       n_G,
                                       int       ijTL,
                                       float*    buff_d, int inc_d, 
                                       float*    buff_e, int inc_e,
                                       scomplex* buff_G, int rs_G, int cs_G,
                                       int*      n_iter_perf );
FLA_Error FLA_Tevd_iteracc_v_opd_var1( int       m_A,
                                       int       n_G,
                                       int       ijTL,
                                       double*   buff_d, int inc_d, 
                                       double*   buff_e, int inc_e,
                                       dcomplex* buff_G, int rs_G, int cs_G,
                                       int*      n_iter_perf );

// Typed forms of Tevd_iteracc_v variant 3. On top of the variant 1 parameters
// these take m_U and three extra vectors with increments: buff_l (real),
// buff_ls (int) and buff_pu (real).
// NOTE(review): the l/ls/pu vectors share names with the l/lstat/pu arguments
// of FLA_Tevd_find_perfshift_*; presumably they are forwarded to it -- confirm.
FLA_Error FLA_Tevd_iteracc_v_ops_var3( int       m_A,
                                       int       m_U,
                                       int       n_G,
                                       int       ijTL,
                                       float*    buff_d, int inc_d, 
                                       float*    buff_e, int inc_e,
                                       float*    buff_l, int inc_l,
                                       int*      buff_ls, int inc_ls,
                                       float*    buff_pu, int inc_pu,
                                       scomplex* buff_G, int rs_G, int cs_G,
                                       int*      n_iter_perf );
FLA_Error FLA_Tevd_iteracc_v_opd_var3( int       m_A,
                                       int       m_U,
                                       int       n_G,
                                       int       ijTL,
                                       double*   buff_d, int inc_d, 
                                       double*   buff_e, int inc_e,
                                       double*   buff_l, int inc_l,
                                       int*      buff_ls, int inc_ls,
                                       double*   buff_pu, int inc_pu,
                                       dcomplex* buff_G, int rs_G, int cs_G,
                                       int*      n_iter_perf );

// end FLA_Tevd_iteracc_v.h
// begin FLA_Tevd_eigval_v.h


// --- FLA_Tevd_eigval_v_opt_var1() --------------------------------------------

// Object-based form: takes G, d, e and an n_iter object.
FLA_Error FLA_Tevd_eigval_v_opt_var1( FLA_Obj G, FLA_Obj d, FLA_Obj e, FLA_Obj n_iter );
// Typed forms (ops: scomplex G with float d/e; opd: dcomplex G with double
// d/e). Here the G buffer precedes the d/e vectors in the parameter list.
// n_iter is an int pointer. Returns an FLA_Error status.
FLA_Error FLA_Tevd_eigval_v_ops_var1( int       m_A,
                                      int       n_G,
                                      scomplex* buff_G, int rs_G, int cs_G,
                                      float*    buff_d, int inc_d, 
                                      float*    buff_e, int inc_e,
                                      int*      n_iter );
FLA_Error FLA_Tevd_eigval_v_opd_var1( int       m_A,
                                      int       n_G,
                                      dcomplex* buff_G, int rs_G, int cs_G,
                                      double*   buff_d, int inc_d, 
                                      double*   buff_e, int inc_e,
                                      int*      n_iter );

// Typed forms of Tevd_eigval_v variant 3. Relative to variant 1 they add m_U
// and the l (real), ls (int) and pu (real) vectors, each with an increment.
// n_iter is an int pointer. Returns an FLA_Error status.
FLA_Error FLA_Tevd_eigval_v_ops_var3( int       m_A,
                                      int       m_U,
                                      int       n_G,
                                      scomplex* buff_G, int rs_G, int cs_G,
                                      float*    buff_d, int inc_d, 
                                      float*    buff_e, int inc_e,
                                      float*    buff_l, int inc_l,
                                      int*      buff_ls, int inc_ls,
                                      float*    buff_pu, int inc_pu,
                                      int*      n_iter );
FLA_Error FLA_Tevd_eigval_v_opd_var3( int       m_A,
                                      int       m_U,
                                      int       n_G,
                                      dcomplex* buff_G, int rs_G, int cs_G,
                                      double*   buff_d, int inc_d, 
                                      double*   buff_e, int inc_e,
                                      double*   buff_l, int inc_l,
                                      int*      buff_ls, int inc_ls,
                                      double*   buff_pu, int inc_pu,
                                      int*      n_iter );

// end FLA_Tevd_eigval_v.h
// begin FLA_Tevd_francis_v.h


// --- FLA_Tevd_francis_v_opt_var1() -------------------------------------------

// Object-based form: takes a shift object and the g, d and e objects.
FLA_Error FLA_Tevd_francis_v_opt_var1( FLA_Obj shift, FLA_Obj g, FLA_Obj d, FLA_Obj e );
// Typed forms (ops: float, opd: double). Relative to the francis_n prototypes
// earlier in this file, these add a complex vector buff_g with increment
// inc_g. The shift is passed by pointer. Returns an FLA_Error status.
FLA_Error FLA_Tevd_francis_v_ops_var1( int       m_A,
                                       float*    buff_shift,
                                       scomplex* buff_g, int inc_g, 
                                       float*    buff_d, int inc_d, 
                                       float*    buff_e, int inc_e ); 
FLA_Error FLA_Tevd_francis_v_opd_var1( int       m_A,
                                       double*   buff_shift,
                                       dcomplex* buff_g, int inc_g, 
                                       double*   buff_d, int inc_d, 
                                       double*   buff_e, int inc_e ); 

// end FLA_Tevd_francis_v.h

// --- FLA_Tevd_compute_scaling() ----------------------------------------------

// Typed forms (ops: float, opd: double). Take the d/e vectors with their
// increments and a sigma pointer of the same real type; the function's own
// return value is an FLA_Error status.
// NOTE(review): sigma presumably receives the computed scaling factor --
// confirm against the definition.
FLA_Error FLA_Tevd_compute_scaling_ops( int       m_A,
                                        float*    buff_d, int inc_d, 
                                        float*    buff_e, int inc_e,
                                        float*    sigma );
FLA_Error FLA_Tevd_compute_scaling_opd( int       m_A,
                                        double*   buff_d, int inc_d, 
                                        double*   buff_e, int inc_e,
                                        double*   sigma );

// --- FLA_Tevd_find_submatrix() -----------------------------------------------

// These two prototypes repeat, with identical signatures, the declarations
// that appear earlier in this file in the FLA_Tevd_n.h section. C permits
// compatible redeclarations, so the repetition is harmless; it is an artifact
// of both sub-headers declaring the same helpers.
// ijTL and ijBR are int output pointers. Returns an FLA_Error status.
FLA_Error FLA_Tevd_find_submatrix_ops( int       m_A,
                                       int       ij_begin,
                                       float*    buff_d, int inc_d, 
                                       float*    buff_e, int inc_e,
                                       int*      ijTL,
                                       int*      ijBR );
FLA_Error FLA_Tevd_find_submatrix_opd( int       m_A,
                                       int       ij_begin,
                                       double*   buff_d, int inc_d, 
                                       double*   buff_e, int inc_e,
                                       int*      ijTL,
                                       int*      ijBR );

// --- FLA_Tevd_find_perfshift() -----------------------------------------------

// Typed forms (ops: float, opd: double). Inputs are two extents (m_d, m_l) and
// five vectors with increments: d, e, l and pu of the real type, and lstat of
// type int. ij_shift is an int output pointer. Returns an FLA_Error status.
// NOTE(review): ij_shift presumably receives the index of the selected shift
// -- confirm against the definition.
FLA_Error FLA_Tevd_find_perfshift_ops( int       m_d,
                                       int       m_l,
                                       float*    buff_d, int inc_d, 
                                       float*    buff_e, int inc_e, 
                                       float*    buff_l, int inc_l, 
                                       int*      buff_lstat, int inc_lstat, 
                                       float*    buff_pu, int inc_pu, 
                                       int*      ij_shift );
FLA_Error FLA_Tevd_find_perfshift_opd( int       m_d,
                                       int       m_l,
                                       double*   buff_d, int inc_d, 
                                       double*   buff_e, int inc_e, 
                                       double*   buff_l, int inc_l, 
                                       int*      buff_lstat, int inc_lstat, 
                                       double*   buff_pu, int inc_pu, 
                                       int*      ij_shift );

// --- FLA_Norm1_tridiag() -----------------------------------------------------

// These three prototypes repeat, with identical signatures, the
// FLA_Norm1_tridiag declarations that appear earlier in this file in the
// FLA_Tevd_n.h section. Compatible redeclarations are legal C.
FLA_Error FLA_Norm1_tridiag( FLA_Obj d, FLA_Obj e, FLA_Obj norm );
// Typed forms (ops: float, opd: double); the result is returned through norm.
FLA_Error FLA_Norm1_tridiag_ops( int       m_A,
                                 float*    buff_d, int inc_d, 
                                 float*    buff_e, int inc_e,
                                 float*    norm );
FLA_Error FLA_Norm1_tridiag_opd( int       m_A,
                                 double*   buff_d, int inc_d, 
                                 double*   buff_e, int inc_e,
                                 double*   norm );

// --- FLA_Tevd_v_opt_var1() ---------------------------------------------------

// Object-based form: an iteration bound n_iter_max, the objects d, e, G and U,
// and a blocksize b_alg.
FLA_Error FLA_Tevd_v_opt_var1( dim_t n_iter_max, FLA_Obj d, FLA_Obj e, FLA_Obj G, FLA_Obj U, dim_t b_alg );
// Typed forms. d and e are always real vectors and G is always complex; only
// the element type of buff_U distinguishes the four:
//   ops: float U, opd: double U, opc: scomplex U, opz: dcomplex U.
// All return an FLA_Error status.
FLA_Error FLA_Tevd_v_ops_var1( int       m_A,
                               int       m_U,
                               int       n_G,
                               int       n_iter_max,
                               float*    buff_d, int inc_d, 
                               float*    buff_e, int inc_e,
                               scomplex* buff_G, int rs_G, int cs_G,
                               float*    buff_U, int rs_U, int cs_U,
                               int       b_alg );
FLA_Error FLA_Tevd_v_opd_var1( int       m_A,
                               int       m_U,
                               int       n_G,
                               int       n_iter_max,
                               double*   buff_d, int inc_d, 
                               double*   buff_e, int inc_e,
                               dcomplex* buff_G, int rs_G, int cs_G,
                               double*   buff_U, int rs_U, int cs_U,
                               int       b_alg );
FLA_Error FLA_Tevd_v_opc_var1( int       m_A,
                               int       m_U,
                               int       n_G,
                               int       n_iter_max,
                               float*    buff_d, int inc_d, 
                               float*    buff_e, int inc_e,
                               scomplex* buff_G, int rs_G, int cs_G,
                               scomplex* buff_U, int rs_U, int cs_U,
                               int       b_alg );
FLA_Error FLA_Tevd_v_opz_var1( int       m_A,
                               int       m_U,
                               int       n_G,
                               int       n_iter_max,
                               double*   buff_d, int inc_d, 
                               double*   buff_e, int inc_e,
                               dcomplex* buff_G, int rs_G, int cs_G,
                               dcomplex* buff_U, int rs_U, int cs_U,
                               int       b_alg );

// --- FLA_Tevd_v_opt_var2() ---------------------------------------------------

// Object-based form: an iteration bound n_iter_max, the objects d, e, G, R, W
// and U, and a blocksize b_alg.
FLA_Error FLA_Tevd_v_opt_var2( dim_t n_iter_max, FLA_Obj d, FLA_Obj e, FLA_Obj G, FLA_Obj R, FLA_Obj W, FLA_Obj U, dim_t b_alg );
// Typed forms. Note that these take n_G_extra where the object-based form
// takes n_iter_max. d, e and R are always real and G is always complex; W and
// U are real in ops/opd and complex in opc/opz. All return an FLA_Error
// status.
FLA_Error FLA_Tevd_v_ops_var2( int       m_A,
                               int       m_U,
                               int       n_G,
                               int       n_G_extra,
                               float*    buff_d, int inc_d, 
                               float*    buff_e, int inc_e,
                               scomplex* buff_G, int rs_G, int cs_G,
                               float*    buff_R, int rs_R, int cs_R,
                               float*    buff_W, int rs_W, int cs_W,
                               float*    buff_U, int rs_U, int cs_U,
                               int       b_alg );
FLA_Error FLA_Tevd_v_opd_var2( int       m_A,
                               int       m_U,
                               int       n_G,
                               int       n_G_extra,
                               double*   buff_d, int inc_d, 
                               double*   buff_e, int inc_e,
                               dcomplex* buff_G, int rs_G, int cs_G,
                               double*   buff_R, int rs_R, int cs_R,
                               double*   buff_W, int rs_W, int cs_W,
                               double*   buff_U, int rs_U, int cs_U,
                               int       b_alg );
FLA_Error FLA_Tevd_v_opc_var2( int       m_A,
                               int       m_U,
                               int       n_G,
                               int       n_G_extra,
                               float*    buff_d, int inc_d, 
                               float*    buff_e, int inc_e,
                               scomplex* buff_G, int rs_G, int cs_G,
                               float*    buff_R, int rs_R, int cs_R,
                               scomplex* buff_W, int rs_W, int cs_W,
                               scomplex* buff_U, int rs_U, int cs_U,
                               int       b_alg );
FLA_Error FLA_Tevd_v_opz_var2( int       m_A,
                               int       m_U,
                               int       n_G,
                               int       n_G_extra,
                               double*   buff_d, int inc_d, 
                               double*   buff_e, int inc_e,
                               dcomplex* buff_G, int rs_G, int cs_G,
                               double*   buff_R, int rs_R, int cs_R,
                               dcomplex* buff_W, int rs_W, int cs_W,
                               dcomplex* buff_U, int rs_U, int cs_U,
                               int       b_alg );

// end FLA_Tevd_v.h

// --- MAC_Tevd_eigval_converged() ---------------------------------------------

// Single-precision convergence test. Evaluates to 1 when
//   |e1| <= eps * sqrt(|d1|) * sqrt(|d2|) + safmin
// and to 0 otherwise.
//
// The whole expansion is wrapped in parentheses so the macro behaves as one
// expression; without them, `!MAC_...( ... )` would negate only fabsf( e1 ).
// sqrt() (the double version) is kept deliberately, so the right-hand side is
// still evaluated in double precision exactly as before.
// Each of e1, d1 and d2 is evaluated once.
#define MAC_Tevd_eigval_converged_ops( eps, safmin, d1, e1, d2 ) \
	( fabsf( e1 ) <= (eps) * sqrt( fabsf( d1 ) ) * sqrt( fabsf( d2 ) ) + (safmin) )

// Double-precision convergence test. Evaluates to 1 when
//   |e1| <= eps * sqrt(|d1|) * sqrt(|d2|) + safmin
// and to 0 otherwise.
//
// The whole expansion is wrapped in parentheses so the macro behaves as one
// expression; without them, `!MAC_...( ... )` would negate only fabs( e1 ).
// Each of e1, d1 and d2 is evaluated once.
#define MAC_Tevd_eigval_converged_opd( eps, safmin, d1, e1, d2 ) \
	( fabs( e1 )  <= (eps) * sqrt( fabs( d1 ) )  * sqrt( fabs( d2 ) )  + (safmin) )

// --- MAC_Tevd_eigval_converged2() ---------------------------------------------

// Single-precision convergence test in squared form. Evaluates to 1 when
//   e1 * e1 <= eps2 * |d1 * d2| + safmin
// and to 0 otherwise.
//
// The whole expansion is wrapped in parentheses so the macro behaves as one
// expression; without them, `!MAC_...( ... )` would expand to
// `!(e1) * (e1) <= ...` and negate only the first factor.
// e1 is evaluated twice, so do not pass an argument with side effects.
#define MAC_Tevd_eigval_converged2_ops( eps2, safmin, d1, e1, d2 ) \
	( (e1) * (e1) <=        (eps2) * fabsf( (d1) * (d2) ) + (safmin) )

// Double-precision convergence test in squared form. Evaluates to 1 when
//   e1 * e1 <= eps2 * |d1 * d2| + safmin
// and to 0 otherwise.
//
// The whole expansion is wrapped in parentheses so the macro behaves as one
// expression; without them, `!MAC_...( ... )` would expand to
// `!(e1) * (e1) <= ...` and negate only the first factor.
// e1 is evaluated twice, so do not pass an argument with side effects.
#define MAC_Tevd_eigval_converged2_opd( eps2, safmin, d1, e1, d2 ) \
	( (e1) * (e1) <=        (eps2) * fabs( (d1) * (d2) ) + (safmin) )

// Front-end Tevd entry point: a jobz selector (FLA_Evd_type) followed by the
// objects U, d, e and l. Returns an FLA_Error status.
FLA_Error FLA_Tevd( FLA_Evd_type jobz, FLA_Obj U, FLA_Obj d, FLA_Obj e, FLA_Obj l );

// end FLA_Tevd.h
// begin FLA_Svd.h


// begin FLA_Svd_ext.h


// Svd_ext "u" variant 1. Takes the jobu/jobv selectors (FLA_Svd_type), an
// iteration bound n_iter_max, the objects A, s, V, U and the dim_t tuning
// values k_accum and b_alg. Returns an FLA_Error status.
// Note the operand order here is V before U, unlike FLA_Svd_uv_unb_var1/2 and
// FLA_Svd(), which take U before V.
FLA_Error FLA_Svd_ext_u_unb_var1( FLA_Svd_type jobu, FLA_Svd_type jobv, 
                                  dim_t n_iter_max,
                                  FLA_Obj A, FLA_Obj s, FLA_Obj V, FLA_Obj U,
                                  dim_t k_accum,
                                  dim_t b_alg );
// end FLA_Svd_ext.h
// begin FLA_Svd_uv.h


// Svd "uv" variants 1 and 2. Both share one parameter list: an iteration bound
// n_iter_max, the objects A, s, U, V and the dim_t tuning values k_accum and
// b_alg. Both return an FLA_Error status.
FLA_Error FLA_Svd_uv_unb_var1( dim_t n_iter_max, FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V, dim_t k_accum, dim_t b_alg );
FLA_Error FLA_Svd_uv_unb_var2( dim_t n_iter_max, FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V, dim_t k_accum, dim_t b_alg );
// end FLA_Svd_uv.h

// Takes A and a sigma object.
// NOTE(review): presumably stores a scaling factor in sigma -- confirm.
FLA_Error FLA_Svd_compute_scaling( FLA_Obj A, FLA_Obj sigma );

// Front-end Svd entry point: jobu/jobv selectors followed by A, s, U and V.
FLA_Error FLA_Svd( FLA_Svd_type jobu, FLA_Svd_type jobv, FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V );
// Extended entry point: same operands, with an FLA_Trans argument paired with
// each of the jobu and jobv selectors. Returns an FLA_Error status.
FLA_Error FLA_Svd_ext( FLA_Svd_type jobu, FLA_Trans transu,
                       FLA_Svd_type jobv, FLA_Trans transv,
                       FLA_Obj A, FLA_Obj s, FLA_Obj U, FLA_Obj V );
// end FLA_Svd.h
// begin FLA_Bsvd.h


// begin FLA_Bsvd_n.h



// end FLA_Bsvd_n.h
// begin FLA_Bsvd_v.h


// begin FLA_Bsvd_iteracc_v.h


// --- FLA_Bsvd_iteracc_v_opt_var1() -------------------------------------------

// Typed forms of Bsvd_iteracc_v variant 1 (ops: float with scomplex G/H;
// opd: double with dcomplex G/H). tol and thresh are passed by value; d and e
// are real vectors with increments; G and H are complex buffers, each with an
// (rs_*, cs_*) pair. n_iter_perf is an int output pointer. Returns an
// FLA_Error status.
FLA_Error FLA_Bsvd_iteracc_v_ops_var1( int       m_A,
                                       int       n_GH,
                                       int       ijTL,
                                       float     tol,
                                       float     thresh,
                                       float*    buff_d, int inc_d, 
                                       float*    buff_e, int inc_e,
                                       scomplex* buff_G, int rs_G, int cs_G,
                                       scomplex* buff_H, int rs_H, int cs_H,
                                       int*      n_iter_perf );
FLA_Error FLA_Bsvd_iteracc_v_opd_var1( int       m_A,
                                       int       n_GH,
                                       int       ijTL,
                                       double    tol,
                                       double    thresh,
                                       double*   buff_d, int inc_d, 
                                       double*   buff_e, int inc_e,
                                       dcomplex* buff_G, int rs_G, int cs_G,
                                       dcomplex* buff_H, int rs_H, int cs_H,
                                       int*      n_iter_perf );
// end FLA_Bsvd_iteracc_v.h
// begin FLA_Bsvd_sinval_v.h


// --- MAC_Bsvd_sinval_is_converged() ------------------------------------------

// Single-precision convergence test. Evaluates to 1 when
//   |e1| <= |tol * d1|
// and to 0 otherwise.
//
// The whole expansion is wrapped in parentheses so the macro behaves as one
// expression; without them, `!MAC_...( ... )` would negate only
// fabsf( (e1) ). Each argument is evaluated once.
#define MAC_Bsvd_sinval_is_converged_ops( tol, d1, e1 ) \
	( fabsf( (e1) ) <= fabsf( (tol) * (d1) ) )

// Double-precision convergence test. Evaluates to 1 when
//   |e1| <= |tol * d1|
// and to 0 otherwise.
//
// The whole expansion is wrapped in parentheses so the macro behaves as one
// expression; without them, `!MAC_...( ... )` would negate only
// fabs( (e1) ). Each argument is evaluated once.
#define MAC_Bsvd_sinval_is_converged_opd( tol, d1, e1 ) \
	( fabs(  (e1) ) <= fabs(  (tol) * (d1) ) )

// --- FLA_Bsvd_sinval_v_opt_var1() --------------------------------------------

// Object-based form: takes tol, thresh, G, H, d, e and an n_iter object.
FLA_Error FLA_Bsvd_sinval_v_opt_var1( FLA_Obj tol, FLA_Obj thresh, FLA_Obj G, FLA_Obj H, FLA_Obj d, FLA_Obj e, FLA_Obj n_iter );
// Typed forms (ops: float with scomplex G/H; opd: double with dcomplex G/H).
// These take an extra n_iter_allowed bound that the object-based form does
// not. tol and thresh are passed by value; n_iter is an int pointer. Returns
// an FLA_Error status.
FLA_Error FLA_Bsvd_sinval_v_ops_var1( int       m_A,
                                      int       n_GH,
                                      int       n_iter_allowed,
                                      float     tol, 
                                      float     thresh, 
                                      scomplex* buff_G, int rs_G, int cs_G,
                                      scomplex* buff_H, int rs_H, int cs_H,
                                      float*    buff_d, int inc_d, 
                                      float*    buff_e, int inc_e,
                                      int*      n_iter );
FLA_Error FLA_Bsvd_sinval_v_opd_var1( int       m_A,
                                      int       n_GH,
                                      int       n_iter_allowed,
                                      double    tol, 
                                      double    thresh, 
                                      dcomplex* buff_G, int rs_G, int cs_G,
                                      dcomplex* buff_H, int rs_H, int cs_H,
                                      double*   buff_d, int inc_d, 
                                      double*   buff_e, int inc_e,
                                      int*      n_iter );

// end FLA_Bsvd_sinval_v.h
// begin FLA_Bsvd_francis_v.h


// --- FLA_Bsvd_francis_v_opt_var1() -------------------------------------------

// Object-based interface: shift, g, h, d and e are all passed as FLA_Obj.
FLA_Error FLA_Bsvd_francis_v_opt_var1( FLA_Obj shift, FLA_Obj g, FLA_Obj h, FLA_Obj d, FLA_Obj e );
// Single-precision buffer interface: shift, d and e are float; g and h are
// scomplex vectors, each addressed with a single increment.
FLA_Error FLA_Bsvd_francis_v_ops_var1( int       m_A,
                                       float     shift,
                                       scomplex* buff_g, int inc_g, 
                                       scomplex* buff_h, int inc_h, 
                                       float*    buff_d, int inc_d, 
                                       float*    buff_e, int inc_e ); 
// Double-precision buffer interface: same parameter list with
// float -> double and scomplex -> dcomplex.
FLA_Error FLA_Bsvd_francis_v_opd_var1( int       m_A,
                                       double    shift,
                                       dcomplex* buff_g, int inc_g, 
                                       dcomplex* buff_h, int inc_h, 
                                       double*   buff_d, int inc_d, 
                                       double*   buff_e, int inc_e ); 

// end FLA_Bsvd_francis_v.h

// --- FLA_Bsvd_compute_shift() ------------------------------------------------

// Object-based interface: all operands, including the resulting shift, are
// passed as FLA_Obj.
FLA_Error FLA_Bsvd_compute_shift( FLA_Obj tol, FLA_Obj sminl, FLA_Obj smax, FLA_Obj d, FLA_Obj e, FLA_Obj shift );
// Single-precision buffer interface: tol, sminl and smax are passed by value;
// shift is passed by pointer (presumably the output -- confirm against the
// implementation).
FLA_Error FLA_Bsvd_compute_shift_ops( int       m_A,
                                      float     tol,
                                      float     sminl,
                                      float     smax,
                                      float*    buff_d, int inc_d,
                                      float*    buff_e, int inc_e,
                                      float*    shift );
// Double-precision buffer interface: same parameter list with float -> double.
FLA_Error FLA_Bsvd_compute_shift_opd( int       m_A,
                                      double    tol,
                                      double    sminl,
                                      double    smax,
                                      double*   buff_d, int inc_d,
                                      double*   buff_e, int inc_e,
                                      double*   shift );

// --- FLA_Bsvd_compute_tol_thresh() -------------------------------------------

// Object-based interface: tolmul, maxit, d, e, tol and thresh are all FLA_Obj.
FLA_Error FLA_Bsvd_compute_tol_thresh( FLA_Obj tolmul, FLA_Obj maxit, FLA_Obj d, FLA_Obj e, FLA_Obj tol, FLA_Obj thresh );
// Single-precision buffer interface. Note that maxit is declared as float,
// not int. tol and thresh are passed by pointer (presumably the outputs --
// confirm against the implementation).
FLA_Error FLA_Bsvd_compute_tol_thresh_ops( int       m_A,
                                           float     tolmul,
                                           float     maxit,
                                           float*    buff_d, int inc_d, 
                                           float*    buff_e, int inc_e, 
                                           float*    tol,
                                           float*    thresh );
// Double-precision buffer interface: same parameter list with float -> double
// (maxit is likewise a double here).
FLA_Error FLA_Bsvd_compute_tol_thresh_opd( int       m_A,
                                           double    tolmul,
                                           double    maxit,
                                           double*   buff_d, int inc_d, 
                                           double*   buff_e, int inc_e, 
                                           double*   tol,
                                           double*   thresh );

// --- FLA_Bsvd_find_converged() -----------------------------------------------

// Object-based interface: tol, d, e and sminl are all FLA_Obj.
FLA_Error FLA_Bsvd_find_converged( FLA_Obj tol, FLA_Obj d, FLA_Obj e, FLA_Obj sminl );
// Single-precision buffer interface: tol is passed by value; sminl is passed
// by pointer (presumably an output -- confirm against the implementation).
FLA_Error FLA_Bsvd_find_converged_ops( int       m_A,
                                       float     tol, 
                                       float*    buff_d, int inc_d, 
                                       float*    buff_e, int inc_e,
                                       float*    sminl );
// Double-precision buffer interface: same parameter list with float -> double.
FLA_Error FLA_Bsvd_find_converged_opd( int       m_A,
                                       double    tol, 
                                       double*   buff_d, int inc_d, 
                                       double*   buff_e, int inc_e,
                                       double*   sminl );

// --- FLA_Bsvd_find_max_min() -------------------------------------------------

// Object-based interface: d, e, smax and smin are all FLA_Obj.
FLA_Error FLA_Bsvd_find_max_min( FLA_Obj d, FLA_Obj e, FLA_Obj smax, FLA_Obj smin );
// Single-precision buffer interface: smax and smin are passed by pointer
// (presumably the outputs -- confirm against the implementation).
FLA_Error FLA_Bsvd_find_max_min_ops( int       m_A,
                                     float*    buff_d, int inc_d, 
                                     float*    buff_e, int inc_e, 
                                     float*    smax,
                                     float*    smin );
// Double-precision buffer interface: same parameter list with float -> double.
FLA_Error FLA_Bsvd_find_max_min_opd( int       m_A,
                                     double*   buff_d, int inc_d, 
                                     double*   buff_e, int inc_e, 
                                     double*   smax,
                                     double*   smin );

// --- FLA_Bsvd_find_submatrix() -----------------------------------------------

// Only buffer-level variants are declared in this section; there is no
// FLA_Obj-based FLA_Bsvd_find_submatrix() prototype here.
// Single-precision variant: ijTL and ijBR are passed by pointer (presumably
// output indices delimiting the submatrix -- confirm against the
// implementation).
FLA_Error FLA_Bsvd_find_submatrix_ops( int       mn_A,
                                       int       ij_begin,
                                       float*    buff_d, int inc_d,
                                       float*    buff_e, int inc_e,
                                       int*      ijTL,
                                       int*      ijBR );
// Double-precision variant: same parameter list with float -> double.
FLA_Error FLA_Bsvd_find_submatrix_opd( int       mn_A,
                                       int       ij_begin,
                                       double*   buff_d, int inc_d,
                                       double*   buff_e, int inc_e,
                                       int*      ijTL,
                                       int*      ijBR );

// --- FLA_Bsvd_v_opt_var1() ---------------------------------------------------

// Object-based interface. Note that n_iter_max and b_alg are dim_t here but
// plain int in the buffer-level variants below.
FLA_Error FLA_Bsvd_v_opt_var1( dim_t n_iter_max, FLA_Obj d, FLA_Obj e, FLA_Obj G, FLA_Obj H, FLA_Obj U, FLA_Obj V, dim_t b_alg );
// The four buffer-level variants share one parameter layout and differ only
// in element types. In all of them d and e are real and G and H are complex;
// the suffix selects the type of U and V:
//   _ops: float d/e,  scomplex G/H, float U/V
//   _opd: double d/e, dcomplex G/H, double U/V
//   _opc: float d/e,  scomplex G/H, scomplex U/V
//   _opz: double d/e, dcomplex G/H, dcomplex U/V
// Matrix operands carry an (rs_, cs_) pair and vector operands a single inc_
// (presumably row/column strides and element increments -- confirm).
FLA_Error FLA_Bsvd_v_ops_var1( int       min_m_n,
                               int       m_U,
                               int       m_V,
                               int       n_GH,
                               int       n_iter_max,
                               float*    buff_d, int inc_d, 
                               float*    buff_e, int inc_e,
                               scomplex* buff_G, int rs_G, int cs_G,
                               scomplex* buff_H, int rs_H, int cs_H,
                               float*    buff_U, int rs_U, int cs_U,
                               float*    buff_V, int rs_V, int cs_V,
                               int       b_alg );
FLA_Error FLA_Bsvd_v_opd_var1( int       min_m_n,
                               int       m_U,
                               int       m_V,
                               int       n_GH,
                               int       n_iter_max,
                               double*   buff_d, int inc_d, 
                               double*   buff_e, int inc_e,
                               dcomplex* buff_G, int rs_G, int cs_G,
                               dcomplex* buff_H, int rs_H, int cs_H,
                               double*   buff_U, int rs_U, int cs_U,
                               double*   buff_V, int rs_V, int cs_V,
                               int       b_alg );
FLA_Error FLA_Bsvd_v_opc_var1( int       min_m_n,
                               int       m_U,
                               int       m_V,
                               int       n_GH,
                               int       n_iter_max,
                               float*    buff_d, int inc_d, 
                               float*    buff_e, int inc_e,
                               scomplex* buff_G, int rs_G, int cs_G,
                               scomplex* buff_H, int rs_H, int cs_H,
                               scomplex* buff_U, int rs_U, int cs_U,
                               scomplex* buff_V, int rs_V, int cs_V,
                               int       b_alg );
FLA_Error FLA_Bsvd_v_opz_var1( int       min_m_n,
                               int       m_U,
                               int       m_V,
                               int       n_GH,
                               int       n_iter_max,
                               double*   buff_d, int inc_d, 
                               double*   buff_e, int inc_e,
                               dcomplex* buff_G, int rs_G, int cs_G,
                               dcomplex* buff_H, int rs_H, int cs_H,
                               dcomplex* buff_U, int rs_U, int cs_U,
                               dcomplex* buff_V, int rs_V, int cs_V,
                               int       b_alg );

// --- FLA_Bsvd_v_opt_var2() ---------------------------------------------------

// Object-based interface. Compared with FLA_Bsvd_v_opt_var1() this variant
// takes three additional objects: RG, RH and W.
FLA_Error FLA_Bsvd_v_opt_var2( dim_t n_iter_max, FLA_Obj d, FLA_Obj e, FLA_Obj G, FLA_Obj H, FLA_Obj RG, FLA_Obj RH, FLA_Obj W, FLA_Obj U, FLA_Obj V, dim_t b_alg );
// The four buffer-level variants share one parameter layout and differ only
// in element types. d, e, RG and RH are real in every variant and G and H are
// complex in every variant; the suffix selects the type of W, U and V:
//   _ops: float real data,  scomplex G/H, float W/U/V
//   _opd: double real data, dcomplex G/H, double W/U/V
//   _opc: float real data,  scomplex G/H, scomplex W/U/V
//   _opz: double real data, dcomplex G/H, dcomplex W/U/V
FLA_Error FLA_Bsvd_v_ops_var2( int       min_m_n,
                               int       m_U,
                               int       m_V,
                               int       n_GH,
                               int       n_iter_max,
                               float*    buff_d, int inc_d, 
                               float*    buff_e, int inc_e,
                               scomplex* buff_G, int rs_G, int cs_G,
                               scomplex* buff_H, int rs_H, int cs_H,
                               float*    buff_RG, int rs_RG, int cs_RG,
                               float*    buff_RH, int rs_RH, int cs_RH,
                               float*    buff_W, int rs_W, int cs_W,
                               float*    buff_U, int rs_U, int cs_U,
                               float*    buff_V, int rs_V, int cs_V,
                               int       b_alg );
FLA_Error FLA_Bsvd_v_opd_var2( int       min_m_n,
                               int       m_U,
                               int       m_V,
                               int       n_GH,
                               int       n_iter_max,
                               double*   buff_d, int inc_d, 
                               double*   buff_e, int inc_e,
                               dcomplex* buff_G, int rs_G, int cs_G,
                               dcomplex* buff_H, int rs_H, int cs_H,
                               double*   buff_RG, int rs_RG, int cs_RG,
                               double*   buff_RH, int rs_RH, int cs_RH,
                               double*   buff_W, int rs_W, int cs_W,
                               double*   buff_U, int rs_U, int cs_U,
                               double*   buff_V, int rs_V, int cs_V,
                               int       b_alg );
FLA_Error FLA_Bsvd_v_opc_var2( int       min_m_n,
                               int       m_U,
                               int       m_V,
                               int       n_GH,
                               int       n_iter_max,
                               float*    buff_d, int inc_d, 
                               float*    buff_e, int inc_e,
                               scomplex* buff_G, int rs_G, int cs_G,
                               scomplex* buff_H, int rs_H, int cs_H,
                               float*    buff_RG, int rs_RG, int cs_RG,
                               float*    buff_RH, int rs_RH, int cs_RH,
                               scomplex* buff_W, int rs_W, int cs_W,
                               scomplex* buff_U, int rs_U, int cs_U,
                               scomplex* buff_V, int rs_V, int cs_V,
                               int       b_alg );
FLA_Error FLA_Bsvd_v_opz_var2( int       min_m_n,
                               int       m_U,
                               int       m_V,
                               int       n_GH,
                               int       n_iter_max,
                               double*   buff_d, int inc_d, 
                               double*   buff_e, int inc_e,
                               dcomplex* buff_G, int rs_G, int cs_G,
                               dcomplex* buff_H, int rs_H, int cs_H,
                               double*   buff_RG, int rs_RG, int cs_RG,
                               double*   buff_RH, int rs_RH, int cs_RH,
                               dcomplex* buff_W, int rs_W, int cs_W,
                               dcomplex* buff_U, int rs_U, int cs_U,
                               dcomplex* buff_V, int rs_V, int cs_V,
                               int       b_alg );

// end FLA_Bsvd_v.h
// begin FLA_Bsvd_ext.h


// --- FLA_Bsvd_ext_opt_var1() ---------------------------------------------------

// Object-based interface. The jobu/jobv selectors and the apply_Uh2C flag
// appear only at this level; the buffer-level variants below have no
// corresponding parameters.
FLA_Error FLA_Bsvd_ext_opt_var1( dim_t n_iter_max, FLA_Obj d, FLA_Obj e, FLA_Obj G, FLA_Obj H, 
                                 FLA_Svd_type jobu, FLA_Obj U, 
                                 FLA_Svd_type jobv, FLA_Obj V, 
                                 FLA_Bool apply_Uh2C, FLA_Obj C,
                                 dim_t b_alg );
// The four buffer-level variants share one parameter layout and differ only
// in element types. d and e are real and G and H are complex in every
// variant; the suffix selects the type of U, V and C:
//   _ops: float d/e,  scomplex G/H, float U/V/C
//   _opd: double d/e, dcomplex G/H, double U/V/C
//   _opc: float d/e,  scomplex G/H, scomplex U/V/C
//   _opz: double d/e, dcomplex G/H, dcomplex U/V/C
// C is described by both m_C and n_C, whereas U and V each get a single
// dimension argument (m_U, m_V).
FLA_Error FLA_Bsvd_ext_ops_var1( int       m_d,
                                 int       m_U,
                                 int       m_V,
                                 int       m_C,
                                 int       n_C,
                                 int       n_GH,
                                 int       n_iter_max,
                                 float*    buff_d, int inc_d, 
                                 float*    buff_e, int inc_e,
                                 scomplex* buff_G, int rs_G, int cs_G,
                                 scomplex* buff_H, int rs_H, int cs_H,
                                 float*    buff_U, int rs_U, int cs_U,
                                 float*    buff_V, int rs_V, int cs_V,
                                 float*    buff_C, int rs_C, int cs_C,
                                 int       b_alg );
FLA_Error FLA_Bsvd_ext_opd_var1( int       m_d,
                                 int       m_U,
                                 int       m_V,
                                 int       m_C,
                                 int       n_C,
                                 int       n_GH,
                                 int       n_iter_max,
                                 double*   buff_d, int inc_d, 
                                 double*   buff_e, int inc_e,
                                 dcomplex* buff_G, int rs_G, int cs_G,
                                 dcomplex* buff_H, int rs_H, int cs_H,
                                 double*   buff_U, int rs_U, int cs_U,
                                 double*   buff_V, int rs_V, int cs_V,
                                 double*   buff_C, int rs_C, int cs_C,
                                 int       b_alg );
FLA_Error FLA_Bsvd_ext_opc_var1( int       m_d,
                                 int       m_U,
                                 int       m_V,
                                 int       m_C,
                                 int       n_C,
                                 int       n_GH,
                                 int       n_iter_max,
                                 float*    buff_d, int inc_d, 
                                 float*    buff_e, int inc_e,
                                 scomplex* buff_G, int rs_G, int cs_G,
                                 scomplex* buff_H, int rs_H, int cs_H,
                                 scomplex* buff_U, int rs_U, int cs_U,
                                 scomplex* buff_V, int rs_V, int cs_V,
                                 scomplex* buff_C, int rs_C, int cs_C,
                                 int       b_alg );
FLA_Error FLA_Bsvd_ext_opz_var1( int       m_d,
                                 int       m_U,
                                 int       m_V,
                                 int       m_C,
                                 int       n_C,
                                 int       n_GH,
                                 int       n_iter_max,
                                 double*   buff_d, int inc_d, 
                                 double*   buff_e, int inc_e,
                                 dcomplex* buff_G, int rs_G, int cs_G,
                                 dcomplex* buff_H, int rs_H, int cs_H,
                                 dcomplex* buff_U, int rs_U, int cs_U,
                                 dcomplex* buff_V, int rs_V, int cs_V,
                                 dcomplex* buff_C, int rs_C, int cs_C,
                                 int       b_alg );

// end FLA_Bsvd_ext.h

// Object-level Bsvd entry points.
// FLA_Bsvd_create_workspace() receives G and H by pointer, so it can fill in
// the caller's objects (presumably the G/H later passed to FLA_Bsvd() --
// confirm against the implementation).
FLA_Error FLA_Bsvd_create_workspace( FLA_Obj d, FLA_Obj *G, FLA_Obj *H );
// FLA_Bsvd() takes a uplo selector plus per-side job selectors for U and V.
FLA_Error FLA_Bsvd( FLA_Uplo uplo, FLA_Obj d, FLA_Obj e, FLA_Obj G, FLA_Obj H, 
                    FLA_Svd_type jobu, FLA_Obj U, 
                    FLA_Svd_type jobv, FLA_Obj V );
// FLA_Bsvd_ext() has the same parameters as FLA_Bsvd() plus an apply_Uh2C
// flag and an additional object C.
FLA_Error FLA_Bsvd_ext( FLA_Uplo uplo, FLA_Obj d, FLA_Obj e, FLA_Obj G, FLA_Obj H,
                        FLA_Svd_type jobu, FLA_Obj U,
                        FLA_Svd_type jobv, FLA_Obj V,
                        FLA_Bool apply_Uh2C, FLA_Obj C );
// end FLA_Bsvd.h

// Inversions
// begin FLA_Trinv.h


// begin FLA_Trinv_ln.h


// skipped #include "FLAME.h" 

// Trinv "ln" family, variants 1-4. (Given the uplo/diag parameters of
// FLA_Trinv_internal(), "ln" presumably means lower triangular, non-unit
// diagonal -- confirm.)
//
// _blk variants take a fla_trinv_t control pointer in addition to A.
FLA_Error FLA_Trinv_ln_blk_var1( FLA_Obj A, fla_trinv_t* cntl );
FLA_Error FLA_Trinv_ln_blk_var2( FLA_Obj A, fla_trinv_t* cntl );
FLA_Error FLA_Trinv_ln_blk_var3( FLA_Obj A, fla_trinv_t* cntl );
FLA_Error FLA_Trinv_ln_blk_var4( FLA_Obj A, fla_trinv_t* cntl );

// _unb variants take only the object A.
FLA_Error FLA_Trinv_ln_unb_var1( FLA_Obj A );
FLA_Error FLA_Trinv_ln_unb_var2( FLA_Obj A );
FLA_Error FLA_Trinv_ln_unb_var3( FLA_Obj A );
FLA_Error FLA_Trinv_ln_unb_var4( FLA_Obj A );

// For each variant: an object-based _opt entry followed by four buffer-level
// entries typed float (_ops), double (_opd), scomplex (_opc) and dcomplex
// (_opz). The buffer entries take a single dimension mn_A plus rs_A/cs_A.
FLA_Error FLA_Trinv_ln_opt_var1( FLA_Obj A );
FLA_Error FLA_Trinv_ln_ops_var1( int mn_A,
                                 float*    A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_ln_opd_var1( int mn_A,
                                 double*   A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_ln_opc_var1( int mn_A,
                                 scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_ln_opz_var1( int mn_A,
                                 dcomplex* A, int rs_A, int cs_A );

FLA_Error FLA_Trinv_ln_opt_var2( FLA_Obj A );
FLA_Error FLA_Trinv_ln_ops_var2( int mn_A,
                                 float*    A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_ln_opd_var2( int mn_A,
                                 double*   A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_ln_opc_var2( int mn_A,
                                 scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_ln_opz_var2( int mn_A,
                                 dcomplex* A, int rs_A, int cs_A );

FLA_Error FLA_Trinv_ln_opt_var3( FLA_Obj A );
FLA_Error FLA_Trinv_ln_ops_var3( int mn_A,
                                 float*    A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_ln_opd_var3( int mn_A,
                                 double*   A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_ln_opc_var3( int mn_A,
                                 scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_ln_opz_var3( int mn_A,
                                 dcomplex* A, int rs_A, int cs_A );

FLA_Error FLA_Trinv_ln_opt_var4( FLA_Obj A );
FLA_Error FLA_Trinv_ln_ops_var4( int mn_A,
                                 float*    A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_ln_opd_var4( int mn_A,
                                 double*   A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_ln_opc_var4( int mn_A,
                                 scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_ln_opz_var4( int mn_A,
                                 dcomplex* A, int rs_A, int cs_A );
// end FLA_Trinv_ln.h
// begin FLA_Trinv_lu.h


// skipped #include "FLAME.h" 

// Trinv "lu" family, variants 1-4. (Given the uplo/diag parameters of
// FLA_Trinv_internal(), "lu" presumably means lower triangular, unit
// diagonal -- confirm.)
//
// _blk variants take a fla_trinv_t control pointer in addition to A.
FLA_Error FLA_Trinv_lu_blk_var1( FLA_Obj A, fla_trinv_t* cntl );
FLA_Error FLA_Trinv_lu_blk_var2( FLA_Obj A, fla_trinv_t* cntl );
FLA_Error FLA_Trinv_lu_blk_var3( FLA_Obj A, fla_trinv_t* cntl );
FLA_Error FLA_Trinv_lu_blk_var4( FLA_Obj A, fla_trinv_t* cntl );

// _unb variants take only the object A.
FLA_Error FLA_Trinv_lu_unb_var1( FLA_Obj A );
FLA_Error FLA_Trinv_lu_unb_var2( FLA_Obj A );
FLA_Error FLA_Trinv_lu_unb_var3( FLA_Obj A );
FLA_Error FLA_Trinv_lu_unb_var4( FLA_Obj A );

// For each variant: an object-based _opt entry followed by four buffer-level
// entries typed float (_ops), double (_opd), scomplex (_opc) and dcomplex
// (_opz). The buffer entries take a single dimension mn_A plus rs_A/cs_A.
FLA_Error FLA_Trinv_lu_opt_var1( FLA_Obj A );
FLA_Error FLA_Trinv_lu_ops_var1( int mn_A,
                                 float*    A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_lu_opd_var1( int mn_A,
                                 double*   A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_lu_opc_var1( int mn_A,
                                 scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_lu_opz_var1( int mn_A,
                                 dcomplex* A, int rs_A, int cs_A );

FLA_Error FLA_Trinv_lu_opt_var2( FLA_Obj A );
FLA_Error FLA_Trinv_lu_ops_var2( int mn_A,
                                 float*    A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_lu_opd_var2( int mn_A,
                                 double*   A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_lu_opc_var2( int mn_A,
                                 scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_lu_opz_var2( int mn_A,
                                 dcomplex* A, int rs_A, int cs_A );

FLA_Error FLA_Trinv_lu_opt_var3( FLA_Obj A );
FLA_Error FLA_Trinv_lu_ops_var3( int mn_A,
                                 float*    A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_lu_opd_var3( int mn_A,
                                 double*   A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_lu_opc_var3( int mn_A,
                                 scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_lu_opz_var3( int mn_A,
                                 dcomplex* A, int rs_A, int cs_A );

FLA_Error FLA_Trinv_lu_opt_var4( FLA_Obj A );
FLA_Error FLA_Trinv_lu_ops_var4( int mn_A,
                                 float*    A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_lu_opd_var4( int mn_A,
                                 double*   A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_lu_opc_var4( int mn_A,
                                 scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_lu_opz_var4( int mn_A,
                                 dcomplex* A, int rs_A, int cs_A );
// end FLA_Trinv_lu.h
// begin FLA_Trinv_un.h


// skipped #include "FLAME.h" 

// Trinv "un" family, variants 1-4. (Given the uplo/diag parameters of
// FLA_Trinv_internal(), "un" presumably means upper triangular, non-unit
// diagonal -- confirm.)
//
// _blk variants take a fla_trinv_t control pointer in addition to A.
FLA_Error FLA_Trinv_un_blk_var1( FLA_Obj A, fla_trinv_t* cntl );
FLA_Error FLA_Trinv_un_blk_var2( FLA_Obj A, fla_trinv_t* cntl );
FLA_Error FLA_Trinv_un_blk_var3( FLA_Obj A, fla_trinv_t* cntl );
FLA_Error FLA_Trinv_un_blk_var4( FLA_Obj A, fla_trinv_t* cntl );

// _unb variants take only the object A.
FLA_Error FLA_Trinv_un_unb_var1( FLA_Obj A );
FLA_Error FLA_Trinv_un_unb_var2( FLA_Obj A );
FLA_Error FLA_Trinv_un_unb_var3( FLA_Obj A );
FLA_Error FLA_Trinv_un_unb_var4( FLA_Obj A );

// For each variant: an object-based _opt entry followed by four buffer-level
// entries typed float (_ops), double (_opd), scomplex (_opc) and dcomplex
// (_opz). The buffer entries take a single dimension mn_A plus rs_A/cs_A.
FLA_Error FLA_Trinv_un_opt_var1( FLA_Obj A );
FLA_Error FLA_Trinv_un_ops_var1( int mn_A,
                                 float*    A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_un_opd_var1( int mn_A,
                                 double*   A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_un_opc_var1( int mn_A,
                                 scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_un_opz_var1( int mn_A,
                                 dcomplex* A, int rs_A, int cs_A );

FLA_Error FLA_Trinv_un_opt_var2( FLA_Obj A );
FLA_Error FLA_Trinv_un_ops_var2( int mn_A,
                                 float*    A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_un_opd_var2( int mn_A,
                                 double*   A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_un_opc_var2( int mn_A,
                                 scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_un_opz_var2( int mn_A,
                                 dcomplex* A, int rs_A, int cs_A );

FLA_Error FLA_Trinv_un_opt_var3( FLA_Obj A );
FLA_Error FLA_Trinv_un_ops_var3( int mn_A,
                                 float*    A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_un_opd_var3( int mn_A,
                                 double*   A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_un_opc_var3( int mn_A,
                                 scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_un_opz_var3( int mn_A,
                                 dcomplex* A, int rs_A, int cs_A );

FLA_Error FLA_Trinv_un_opt_var4( FLA_Obj A );
FLA_Error FLA_Trinv_un_ops_var4( int mn_A,
                                 float*    A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_un_opd_var4( int mn_A,
                                 double*   A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_un_opc_var4( int mn_A,
                                 scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_un_opz_var4( int mn_A,
                                 dcomplex* A, int rs_A, int cs_A );
// end FLA_Trinv_un.h
// begin FLA_Trinv_uu.h


// skipped #include "FLAME.h" 

// Trinv "uu" family, variants 1-4. (Given the uplo/diag parameters of
// FLA_Trinv_internal(), "uu" presumably means upper triangular, unit
// diagonal -- confirm.)
//
// _blk variants take a fla_trinv_t control pointer in addition to A.
FLA_Error FLA_Trinv_uu_blk_var1( FLA_Obj A, fla_trinv_t* cntl );
FLA_Error FLA_Trinv_uu_blk_var2( FLA_Obj A, fla_trinv_t* cntl );
FLA_Error FLA_Trinv_uu_blk_var3( FLA_Obj A, fla_trinv_t* cntl );
FLA_Error FLA_Trinv_uu_blk_var4( FLA_Obj A, fla_trinv_t* cntl );

// _unb variants take only the object A.
FLA_Error FLA_Trinv_uu_unb_var1( FLA_Obj A );
FLA_Error FLA_Trinv_uu_unb_var2( FLA_Obj A );
FLA_Error FLA_Trinv_uu_unb_var3( FLA_Obj A );
FLA_Error FLA_Trinv_uu_unb_var4( FLA_Obj A );

// For each variant: an object-based _opt entry followed by four buffer-level
// entries typed float (_ops), double (_opd), scomplex (_opc) and dcomplex
// (_opz). The buffer entries take a single dimension mn_A plus rs_A/cs_A.
FLA_Error FLA_Trinv_uu_opt_var1( FLA_Obj A );
FLA_Error FLA_Trinv_uu_ops_var1( int mn_A,
                                 float*    A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_uu_opd_var1( int mn_A,
                                 double*   A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_uu_opc_var1( int mn_A,
                                 scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_uu_opz_var1( int mn_A,
                                 dcomplex* A, int rs_A, int cs_A );

FLA_Error FLA_Trinv_uu_opt_var2( FLA_Obj A );
FLA_Error FLA_Trinv_uu_ops_var2( int mn_A,
                                 float*    A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_uu_opd_var2( int mn_A,
                                 double*   A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_uu_opc_var2( int mn_A,
                                 scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_uu_opz_var2( int mn_A,
                                 dcomplex* A, int rs_A, int cs_A );

FLA_Error FLA_Trinv_uu_opt_var3( FLA_Obj A );
FLA_Error FLA_Trinv_uu_ops_var3( int mn_A,
                                 float*    A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_uu_opd_var3( int mn_A,
                                 double*   A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_uu_opc_var3( int mn_A,
                                 scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_uu_opz_var3( int mn_A,
                                 dcomplex* A, int rs_A, int cs_A );

FLA_Error FLA_Trinv_uu_opt_var4( FLA_Obj A );
FLA_Error FLA_Trinv_uu_ops_var4( int mn_A,
                                 float*    A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_uu_opd_var4( int mn_A,
                                 double*   A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_uu_opc_var4( int mn_A,
                                 scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_Trinv_uu_opz_var4( int mn_A,
                                 dcomplex* A, int rs_A, int cs_A );
// end FLA_Trinv_uu.h

// Control-tree driven entry point: takes explicit uplo and diag selectors
// together with a fla_trinv_t control pointer.
FLA_Error FLA_Trinv_internal( FLA_Uplo uplo, FLA_Diag diag, FLA_Obj A, fla_trinv_t* cntl );

// One entry per two-letter suffix (ln, lu, un, uu); each takes the same
// (A, cntl) pair. The suffixes presumably correspond to the uplo/diag
// combinations accepted by FLA_Trinv_internal() -- confirm against the
// implementation.
FLA_Error FLA_Trinv_ln( FLA_Obj A, fla_trinv_t* cntl );
FLA_Error FLA_Trinv_lu( FLA_Obj A, fla_trinv_t* cntl );
FLA_Error FLA_Trinv_un( FLA_Obj A, fla_trinv_t* cntl );
FLA_Error FLA_Trinv_uu( FLA_Obj A, fla_trinv_t* cntl );

// end FLA_Trinv.h
// begin FLA_SPDinv.h


// skipped #include "FLAME.h" 

// Control-tree driven SPDinv entry point: takes a uplo selector, the object A
// and a fla_spdinv_t control pointer.
FLA_Error FLA_SPDinv_internal( FLA_Uplo uplo, FLA_Obj A, fla_spdinv_t* cntl );
// end FLA_SPDinv.h

// Reductions
// begin FLA_Hess_UT.h


// begin FLA_Hess_UT_vars.h


// FLA_Obj-based FLA_Hess_UT variants 1 through 5. Every variant declares
// blk, unb and step_unb entry points; variants 2-4 additionally declare blf.
// The blk/blf/unb routines all take ( A, T ).
// NOTE(review): blk/unb/blf presumably mean blocked / unblocked /
// blocked-with-fused-kernels -- confirm against the libflame documentation.
FLA_Error FLA_Hess_UT_blk_var1( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Hess_UT_unb_var1( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Hess_UT_step_unb_var1( FLA_Obj A, FLA_Obj T );

FLA_Error FLA_Hess_UT_blk_var2( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Hess_UT_blf_var2( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Hess_UT_unb_var2( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Hess_UT_step_unb_var2( FLA_Obj A, FLA_Obj T );

FLA_Error FLA_Hess_UT_blk_var3( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Hess_UT_blf_var3( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Hess_UT_unb_var3( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Hess_UT_step_unb_var3( FLA_Obj A, FLA_Obj T );

// Variant 4: the step routine takes two extra objects, Y and Z, between A
// and T.
FLA_Error FLA_Hess_UT_blk_var4( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Hess_UT_blf_var4( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Hess_UT_unb_var4( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Hess_UT_step_unb_var4( FLA_Obj A, FLA_Obj Y, FLA_Obj Z, FLA_Obj T );

// Variant 5: the step routine takes two extra objects, U and Z, between A
// and T. No blf entry point is declared for this variant.
FLA_Error FLA_Hess_UT_blk_var5( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Hess_UT_unb_var5( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Hess_UT_step_unb_var5( FLA_Obj A, FLA_Obj U, FLA_Obj Z, FLA_Obj T );


// FLA_Hess_UT variant 1, "opt" family: two FLA_Obj-based entry points
// (opt, step_opt) followed by typed step routines on raw buffers. The
// s/d/c/z letter in each name matches the buffer element type (float,
// double, scomplex, dcomplex).
// Typed routines take m_A and m_T, then each buffer as a pointer followed by
// rs_*/cs_* ints (presumably row/column strides -- TODO confirm).
FLA_Error FLA_Hess_UT_opt_var1( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Hess_UT_step_opt_var1( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Hess_UT_step_ops_var1( int m_A,
                                     int m_T,
                                     float* buff_A, int rs_A, int cs_A, 
                                     float* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Hess_UT_step_opd_var1( int m_A,
                                     int m_T,
                                     double* buff_A, int rs_A, int cs_A, 
                                     double* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Hess_UT_step_opc_var1( int m_A,
                                     int m_T,
                                     scomplex* buff_A, int rs_A, int cs_A, 
                                     scomplex* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Hess_UT_step_opz_var1( int m_A,
                                     int m_T,
                                     dcomplex* buff_A, int rs_A, int cs_A, 
                                     dcomplex* buff_T, int rs_T, int cs_T );


// FLA_Hess_UT variant 2, "opt" family: two FLA_Obj-based entry points
// (opt, step_opt) followed by typed step routines on raw buffers. The
// s/d/c/z letter in each name matches the buffer element type (float,
// double, scomplex, dcomplex).
// Typed routines take m_A and m_T, then each buffer as a pointer followed by
// rs_*/cs_* ints (presumably row/column strides -- TODO confirm).
FLA_Error FLA_Hess_UT_opt_var2( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Hess_UT_step_opt_var2( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Hess_UT_step_ops_var2( int m_A,
                                     int m_T,
                                     float* buff_A, int rs_A, int cs_A, 
                                     float* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Hess_UT_step_opd_var2( int m_A,
                                     int m_T,
                                     double* buff_A, int rs_A, int cs_A, 
                                     double* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Hess_UT_step_opc_var2( int m_A,
                                     int m_T,
                                     scomplex* buff_A, int rs_A, int cs_A, 
                                     scomplex* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Hess_UT_step_opz_var2( int m_A,
                                     int m_T,
                                     dcomplex* buff_A, int rs_A, int cs_A, 
                                     dcomplex* buff_T, int rs_T, int cs_T );


// FLA_Hess_UT variant 3, "opt" family: two FLA_Obj-based entry points
// (opt, step_opt) followed by typed step routines on raw buffers. The
// s/d/c/z letter in each name matches the buffer element type (float,
// double, scomplex, dcomplex).
// Typed routines take m_A and m_T, then each buffer as a pointer followed by
// rs_*/cs_* ints (presumably row/column strides -- TODO confirm).
FLA_Error FLA_Hess_UT_opt_var3( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Hess_UT_step_opt_var3( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Hess_UT_step_ops_var3( int m_A,
                                     int m_T,
                                     float* buff_A, int rs_A, int cs_A, 
                                     float* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Hess_UT_step_opd_var3( int m_A,
                                     int m_T,
                                     double* buff_A, int rs_A, int cs_A, 
                                     double* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Hess_UT_step_opc_var3( int m_A,
                                     int m_T,
                                     scomplex* buff_A, int rs_A, int cs_A, 
                                     scomplex* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Hess_UT_step_opz_var3( int m_A,
                                     int m_T,
                                     dcomplex* buff_A, int rs_A, int cs_A, 
                                     dcomplex* buff_T, int rs_T, int cs_T );


// FLA_Hess_UT variant 4, "opt" family. Unlike variants 1-3, the step
// routines take two extra operands, Y and Z, between A and T (as FLA_Obj in
// step_opt, as buffer + rs/cs triples in the typed routines).
// The s/d/c/z letter in each name matches the buffer element type (float,
// double, scomplex, dcomplex); rs_*/cs_* are presumably row/column strides
// -- TODO confirm.
FLA_Error FLA_Hess_UT_opt_var4( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Hess_UT_step_opt_var4( FLA_Obj A, FLA_Obj Y, FLA_Obj Z, FLA_Obj T );
FLA_Error FLA_Hess_UT_step_ops_var4( int m_A,
                                     int m_T,
                                     float* buff_A, int rs_A, int cs_A, 
                                     float* buff_Y, int rs_Y, int cs_Y, 
                                     float* buff_Z, int rs_Z, int cs_Z, 
                                     float* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Hess_UT_step_opd_var4( int m_A,
                                     int m_T,
                                     double* buff_A, int rs_A, int cs_A, 
                                     double* buff_Y, int rs_Y, int cs_Y, 
                                     double* buff_Z, int rs_Z, int cs_Z, 
                                     double* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Hess_UT_step_opc_var4( int m_A,
                                     int m_T,
                                     scomplex* buff_A, int rs_A, int cs_A, 
                                     scomplex* buff_Y, int rs_Y, int cs_Y, 
                                     scomplex* buff_Z, int rs_Z, int cs_Z, 
                                     scomplex* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Hess_UT_step_opz_var4( int m_A,
                                     int m_T,
                                     dcomplex* buff_A, int rs_A, int cs_A, 
                                     dcomplex* buff_Y, int rs_Y, int cs_Y, 
                                     dcomplex* buff_Z, int rs_Z, int cs_Z, 
                                     dcomplex* buff_T, int rs_T, int cs_T );


// FLA_Hess_UT variant 5, "opt" family. The step routines take two extra
// operands, U and Z, between A and T (as FLA_Obj in step_opt, as buffer +
// rs/cs triples in the typed routines).
// The s/d/c/z letter in each name matches the buffer element type (float,
// double, scomplex, dcomplex); rs_*/cs_* are presumably row/column strides
// -- TODO confirm.
FLA_Error FLA_Hess_UT_opt_var5( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Hess_UT_step_opt_var5( FLA_Obj A, FLA_Obj U, FLA_Obj Z, FLA_Obj T );
FLA_Error FLA_Hess_UT_step_ops_var5( int m_A,
                                     int m_T,
                                     float* buff_A, int rs_A, int cs_A, 
                                     float* buff_U, int rs_U, int cs_U, 
                                     float* buff_Z, int rs_Z, int cs_Z, 
                                     float* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Hess_UT_step_opd_var5( int m_A,
                                     int m_T,
                                     double* buff_A, int rs_A, int cs_A, 
                                     double* buff_U, int rs_U, int cs_U, 
                                     double* buff_Z, int rs_Z, int cs_Z, 
                                     double* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Hess_UT_step_opc_var5( int m_A,
                                     int m_T,
                                     scomplex* buff_A, int rs_A, int cs_A, 
                                     scomplex* buff_U, int rs_U, int cs_U, 
                                     scomplex* buff_Z, int rs_Z, int cs_Z, 
                                     scomplex* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Hess_UT_step_opz_var5( int m_A,
                                     int m_T,
                                     dcomplex* buff_A, int rs_A, int cs_A, 
                                     dcomplex* buff_U, int rs_U, int cs_U, 
                                     dcomplex* buff_Z, int rs_Z, int cs_Z, 
                                     dcomplex* buff_T, int rs_T, int cs_T );


// FLA_Hess_UT variant 1, "ofu" family: two FLA_Obj-based entry points
// (ofu, step_ofu) followed by typed step routines (ofs/ofd/ofc/ofz) whose
// last letter matches the buffer element type (float, double, scomplex,
// dcomplex). Signatures mirror the corresponding "opt" family routines.
// NOTE(review): "of" presumably denotes an optimized variant built on fused
// kernels -- confirm against the implementation.
FLA_Error FLA_Hess_UT_ofu_var1( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Hess_UT_step_ofu_var1( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Hess_UT_step_ofs_var1( int m_A,
                                     int m_T,
                                     float* buff_A, int rs_A, int cs_A, 
                                     float* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Hess_UT_step_ofd_var1( int m_A,
                                     int m_T,
                                     double* buff_A, int rs_A, int cs_A, 
                                     double* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Hess_UT_step_ofc_var1( int m_A,
                                     int m_T,
                                     scomplex* buff_A, int rs_A, int cs_A, 
                                     scomplex* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Hess_UT_step_ofz_var1( int m_A,
                                     int m_T,
                                     dcomplex* buff_A, int rs_A, int cs_A, 
                                     dcomplex* buff_T, int rs_T, int cs_T );


// FLA_Hess_UT variant 2, "ofu" family: two FLA_Obj-based entry points
// (ofu, step_ofu) followed by typed step routines (ofs/ofd/ofc/ofz) whose
// last letter matches the buffer element type (float, double, scomplex,
// dcomplex). Signatures mirror the corresponding "opt" family routines.
// NOTE(review): "of" presumably denotes an optimized variant built on fused
// kernels -- confirm against the implementation.
FLA_Error FLA_Hess_UT_ofu_var2( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Hess_UT_step_ofu_var2( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Hess_UT_step_ofs_var2( int m_A,
                                     int m_T,
                                     float* buff_A, int rs_A, int cs_A, 
                                     float* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Hess_UT_step_ofd_var2( int m_A,
                                     int m_T,
                                     double* buff_A, int rs_A, int cs_A, 
                                     double* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Hess_UT_step_ofc_var2( int m_A,
                                     int m_T,
                                     scomplex* buff_A, int rs_A, int cs_A, 
                                     scomplex* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Hess_UT_step_ofz_var2( int m_A,
                                     int m_T,
                                     dcomplex* buff_A, int rs_A, int cs_A, 
                                     dcomplex* buff_T, int rs_T, int cs_T );


// FLA_Hess_UT variant 3, "ofu" family: two FLA_Obj-based entry points
// (ofu, step_ofu) followed by typed step routines (ofs/ofd/ofc/ofz) whose
// last letter matches the buffer element type (float, double, scomplex,
// dcomplex). Signatures mirror the corresponding "opt" family routines.
// NOTE(review): "of" presumably denotes an optimized variant built on fused
// kernels -- confirm against the implementation.
FLA_Error FLA_Hess_UT_ofu_var3( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Hess_UT_step_ofu_var3( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Hess_UT_step_ofs_var3( int m_A,
                                     int m_T,
                                     float* buff_A, int rs_A, int cs_A, 
                                     float* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Hess_UT_step_ofd_var3( int m_A,
                                     int m_T,
                                     double* buff_A, int rs_A, int cs_A, 
                                     double* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Hess_UT_step_ofc_var3( int m_A,
                                     int m_T,
                                     scomplex* buff_A, int rs_A, int cs_A, 
                                     scomplex* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Hess_UT_step_ofz_var3( int m_A,
                                     int m_T,
                                     dcomplex* buff_A, int rs_A, int cs_A, 
                                     dcomplex* buff_T, int rs_T, int cs_T );


// FLA_Hess_UT variant 4, "ofu" family. As in the "opt" family of this
// variant, the step routines take two extra operands, Y and Z, between A and
// T. The last letter of ofs/ofd/ofc/ofz matches the buffer element type
// (float, double, scomplex, dcomplex); rs_*/cs_* are presumably row/column
// strides -- TODO confirm.
FLA_Error FLA_Hess_UT_ofu_var4( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Hess_UT_step_ofu_var4( FLA_Obj A, FLA_Obj Y, FLA_Obj Z, FLA_Obj T );
FLA_Error FLA_Hess_UT_step_ofs_var4( int m_A,
                                     int m_T,
                                     float* buff_A, int rs_A, int cs_A, 
                                     float* buff_Y, int rs_Y, int cs_Y,
                                     float* buff_Z, int rs_Z, int cs_Z,
                                     float* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Hess_UT_step_ofd_var4( int m_A,
                                     int m_T,
                                     double* buff_A, int rs_A, int cs_A, 
                                     double* buff_Y, int rs_Y, int cs_Y,
                                     double* buff_Z, int rs_Z, int cs_Z,
                                     double* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Hess_UT_step_ofc_var4( int m_A,
                                     int m_T,
                                     scomplex* buff_A, int rs_A, int cs_A, 
                                     scomplex* buff_Y, int rs_Y, int cs_Y,
                                     scomplex* buff_Z, int rs_Z, int cs_Z,
                                     scomplex* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Hess_UT_step_ofz_var4( int m_A,
                                     int m_T,
                                     dcomplex* buff_A, int rs_A, int cs_A, 
                                     dcomplex* buff_Y, int rs_Y, int cs_Y,
                                     dcomplex* buff_Z, int rs_Z, int cs_Z,
                                     dcomplex* buff_T, int rs_T, int cs_T );


// --- Fused operations --------------------------------------------------------

// Typed buffer-level FLA_Fused_Ahx_Ax routines, one per element type (float,
// double, scomplex, dcomplex). Each takes the dimensions m_A and n_A, the
// matrix buffer A with rs_A/cs_A, and three vector buffers x, v, w, each
// followed by an inc_* int (presumably the element stride -- TODO confirm).
// NOTE(review): the name suggests a combined A^H x and A x computation; which
// of v/w receives which result is not visible in this header.
FLA_Error FLA_Fused_Ahx_Ax_ops_var1( int m_A,
                                     int n_A,
                                     float* buff_A, int rs_A, int cs_A, 
                                     float* buff_x, int inc_x, 
                                     float* buff_v, int inc_v, 
                                     float* buff_w, int inc_w );
FLA_Error FLA_Fused_Ahx_Ax_opd_var1( int m_A,
                                     int n_A,
                                     double* buff_A, int rs_A, int cs_A, 
                                     double* buff_x, int inc_x, 
                                     double* buff_v, int inc_v, 
                                     double* buff_w, int inc_w );
FLA_Error FLA_Fused_Ahx_Ax_opc_var1( int m_A,
                                     int n_A,
                                     scomplex* buff_A, int rs_A, int cs_A, 
                                     scomplex* buff_x, int inc_x, 
                                     scomplex* buff_v, int inc_v, 
                                     scomplex* buff_w, int inc_w );
FLA_Error FLA_Fused_Ahx_Ax_opz_var1( int m_A,
                                     int n_A,
                                     dcomplex* buff_A, int rs_A, int cs_A, 
                                     dcomplex* buff_x, int inc_x, 
                                     dcomplex* buff_v, int inc_v, 
                                     dcomplex* buff_w, int inc_w );


// Typed buffer-level FLA_Fused_Gerc2_Ahx_Ax routines, one per element type
// (float, double, scomplex, dcomplex). Compared with FLA_Fused_Ahx_Ax, each
// takes a scalar pointer buff_alpha and three additional vectors u, y, z
// ahead of the A, x, v, w operands.
// Vector arguments are passed as a pointer plus an inc_* int (presumably the
// element stride -- TODO confirm).
// NOTE(review): the name suggests a rank-2 update of A fused with A^H x and
// A x products; the exact operation is not visible in this header.
FLA_Error FLA_Fused_Gerc2_Ahx_Ax_ops_var1( int m_A,
                                           int n_A,
                                           float* buff_alpha, 
                                           float* buff_u, int inc_u, 
                                           float* buff_y, int inc_y, 
                                           float* buff_z, int inc_z, 
                                           float* buff_A, int rs_A, int cs_A, 
                                           float* buff_x, int inc_x, 
                                           float* buff_v, int inc_v, 
                                           float* buff_w, int inc_w );
FLA_Error FLA_Fused_Gerc2_Ahx_Ax_opd_var1( int m_A,
                                           int n_A,
                                           double* buff_alpha, 
                                           double* buff_u, int inc_u, 
                                           double* buff_y, int inc_y, 
                                           double* buff_z, int inc_z, 
                                           double* buff_A, int rs_A, int cs_A, 
                                           double* buff_x, int inc_x, 
                                           double* buff_v, int inc_v, 
                                           double* buff_w, int inc_w );
FLA_Error FLA_Fused_Gerc2_Ahx_Ax_opc_var1( int m_A,
                                           int n_A,
                                           scomplex* buff_alpha, 
                                           scomplex* buff_u, int inc_u, 
                                           scomplex* buff_y, int inc_y, 
                                           scomplex* buff_z, int inc_z, 
                                           scomplex* buff_A, int rs_A, int cs_A, 
                                           scomplex* buff_x, int inc_x, 
                                           scomplex* buff_v, int inc_v, 
                                           scomplex* buff_w, int inc_w );
FLA_Error FLA_Fused_Gerc2_Ahx_Ax_opz_var1( int m_A,
                                           int n_A,
                                           dcomplex* buff_alpha, 
                                           dcomplex* buff_u, int inc_u, 
                                           dcomplex* buff_y, int inc_y, 
                                           dcomplex* buff_z, int inc_z, 
                                           dcomplex* buff_A, int rs_A, int cs_A, 
                                           dcomplex* buff_x, int inc_x, 
                                           dcomplex* buff_v, int inc_v, 
                                           dcomplex* buff_w, int inc_w );


// Typed buffer-level FLA_Fused_Uhu_Yhu_Zhu routines, one per element type
// (float, double, scomplex, dcomplex). Each takes the dimensions m_U and
// n_U, a scalar pointer buff_delta, three matrix buffers U, Y, Z (each with
// rs_*/cs_*), and four vector buffers t, u, y, z (each with inc_*).
// NOTE(review): the name suggests fused U^H u, Y^H u and Z^H u products;
// which operands are inputs and which are outputs is not visible here.
FLA_Error FLA_Fused_Uhu_Yhu_Zhu_ops_var1( int m_U,
                                          int n_U,
                                          float* buff_delta,
                                          float* buff_U, int rs_U, int cs_U,
                                          float* buff_Y, int rs_Y, int cs_Y,
                                          float* buff_Z, int rs_Z, int cs_Z,
                                          float* buff_t, int inc_t,
                                          float* buff_u, int inc_u,
                                          float* buff_y, int inc_y,
                                          float* buff_z, int inc_z );
FLA_Error FLA_Fused_Uhu_Yhu_Zhu_opd_var1( int m_U,
                                          int n_U,
                                          double* buff_delta,
                                          double* buff_U, int rs_U, int cs_U,
                                          double* buff_Y, int rs_Y, int cs_Y,
                                          double* buff_Z, int rs_Z, int cs_Z,
                                          double* buff_t, int inc_t,
                                          double* buff_u, int inc_u,
                                          double* buff_y, int inc_y,
                                          double* buff_z, int inc_z );
FLA_Error FLA_Fused_Uhu_Yhu_Zhu_opc_var1( int m_U,
                                          int n_U,
                                          scomplex* buff_delta,
                                          scomplex* buff_U, int rs_U, int cs_U,
                                          scomplex* buff_Y, int rs_Y, int cs_Y,
                                          scomplex* buff_Z, int rs_Z, int cs_Z,
                                          scomplex* buff_t, int inc_t,
                                          scomplex* buff_u, int inc_u,
                                          scomplex* buff_y, int inc_y,
                                          scomplex* buff_z, int inc_z );
FLA_Error FLA_Fused_Uhu_Yhu_Zhu_opz_var1( int m_U,
                                          int n_U,
                                          dcomplex* buff_delta,
                                          dcomplex* buff_U, int rs_U, int cs_U,
                                          dcomplex* buff_Y, int rs_Y, int cs_Y,
                                          dcomplex* buff_Z, int rs_Z, int cs_Z,
                                          dcomplex* buff_t, int inc_t,
                                          dcomplex* buff_u, int inc_u,
                                          dcomplex* buff_y, int inc_y,
                                          dcomplex* buff_z, int inc_z );

// end FLA_Hess_UT_vars.h

// Top-level FLA_Hess_UT entry point operating on objects A and T.
FLA_Error FLA_Hess_UT( FLA_Obj A, FLA_Obj T );

// Same operands as FLA_Hess_UT plus a pointer to a fla_hessut_t control
// structure.
FLA_Error FLA_Hess_UT_internal( FLA_Obj A, FLA_Obj T, fla_hessut_t* cntl );

// Takes A by value and T by pointer, so T is an output argument.
// NOTE(review): presumably allocates a T sized appropriately for A -- confirm
// who is responsible for freeing it.
FLA_Error FLA_Hess_UT_create_T( FLA_Obj A, FLA_Obj* T );

// Takes objects T and t.
// NOTE(review): presumably extracts the tau scalars from T into t -- confirm
// against the implementation.
FLA_Error FLA_Hess_UT_recover_tau( FLA_Obj T, FLA_Obj t );
// end FLA_Hess_UT.h
// begin FLA_Tridiag_UT.h


// begin FLA_Tridiag_UT_l.h


// FLA_Obj-based FLA_Tridiag_UT_l variants 1 through 3. Every variant
// declares blk, unb and step_unb entry points; variants 2 and 3 additionally
// declare blf. The blk/blf/unb routines all take ( A, T ).
// NOTE(review): blk/unb/blf presumably mean blocked / unblocked /
// blocked-with-fused-kernels -- confirm against the libflame documentation.
FLA_Error FLA_Tridiag_UT_l_blk_var1( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Tridiag_UT_l_unb_var1( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Tridiag_UT_l_step_unb_var1( FLA_Obj A, FLA_Obj T );

FLA_Error FLA_Tridiag_UT_l_blk_var2( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Tridiag_UT_l_blf_var2( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Tridiag_UT_l_unb_var2( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Tridiag_UT_l_step_unb_var2( FLA_Obj A, FLA_Obj T );

// Variant 3: the step routine takes one extra object, Z, between A and T.
FLA_Error FLA_Tridiag_UT_l_blk_var3( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Tridiag_UT_l_blf_var3( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Tridiag_UT_l_unb_var3( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Tridiag_UT_l_step_unb_var3( FLA_Obj A, FLA_Obj Z, FLA_Obj T );

// FLA_Tridiag_UT_l variant 1, "opt" family: two FLA_Obj-based entry points
// (opt, step_opt) followed by typed step routines on raw buffers. The
// s/d/c/z letter in each name matches the buffer element type (float,
// double, scomplex, dcomplex).
// Typed routines take m_A and m_T, then each buffer as a pointer followed by
// rs_*/cs_* ints (presumably row/column strides -- TODO confirm).
FLA_Error FLA_Tridiag_UT_l_opt_var1( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Tridiag_UT_l_step_opt_var1( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Tridiag_UT_l_step_ops_var1( int m_A,
                                          int m_T,
                                          float* buff_A, int rs_A, int cs_A, 
                                          float* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Tridiag_UT_l_step_opd_var1( int m_A,
                                          int m_T,
                                          double* buff_A, int rs_A, int cs_A, 
                                          double* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Tridiag_UT_l_step_opc_var1( int m_A,
                                          int m_T,
                                          scomplex* buff_A, int rs_A, int cs_A, 
                                          scomplex* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Tridiag_UT_l_step_opz_var1( int m_A,
                                          int m_T,
                                          dcomplex* buff_A, int rs_A, int cs_A, 
                                          dcomplex* buff_T, int rs_T, int cs_T );

// FLA_Tridiag_UT_l variant 2, "opt" family: two FLA_Obj-based entry points
// (opt, step_opt) followed by typed step routines on raw buffers. The
// s/d/c/z letter in each name matches the buffer element type (float,
// double, scomplex, dcomplex).
// Typed routines take m_A and m_T, then each buffer as a pointer followed by
// rs_*/cs_* ints (presumably row/column strides -- TODO confirm).
FLA_Error FLA_Tridiag_UT_l_opt_var2( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Tridiag_UT_l_step_opt_var2( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Tridiag_UT_l_step_ops_var2( int m_A,
                                          int m_T,
                                          float* buff_A, int rs_A, int cs_A, 
                                          float* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Tridiag_UT_l_step_opd_var2( int m_A,
                                          int m_T,
                                          double* buff_A, int rs_A, int cs_A, 
                                          double* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Tridiag_UT_l_step_opc_var2( int m_A,
                                          int m_T,
                                          scomplex* buff_A, int rs_A, int cs_A, 
                                          scomplex* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Tridiag_UT_l_step_opz_var2( int m_A,
                                          int m_T,
                                          dcomplex* buff_A, int rs_A, int cs_A, 
                                          dcomplex* buff_T, int rs_T, int cs_T );

// FLA_Tridiag_UT_l variant 3, "opt" family. Unlike variants 1 and 2, the
// step routines take one extra operand, Z, between A and T (as FLA_Obj in
// step_opt, as a buffer + rs/cs triple in the typed routines).
// The s/d/c/z letter in each name matches the buffer element type (float,
// double, scomplex, dcomplex); rs_*/cs_* are presumably row/column strides
// -- TODO confirm.
FLA_Error FLA_Tridiag_UT_l_opt_var3( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Tridiag_UT_l_step_opt_var3( FLA_Obj A, FLA_Obj Z, FLA_Obj T );
FLA_Error FLA_Tridiag_UT_l_step_ops_var3( int m_A,
                                          int m_T,
                                          float* buff_A, int rs_A, int cs_A, 
                                          float* buff_Z, int rs_Z, int cs_Z, 
                                          float* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Tridiag_UT_l_step_opd_var3( int m_A,
                                          int m_T,
                                          double* buff_A, int rs_A, int cs_A, 
                                          double* buff_Z, int rs_Z, int cs_Z, 
                                          double* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Tridiag_UT_l_step_opc_var3( int m_A,
                                          int m_T,
                                          scomplex* buff_A, int rs_A, int cs_A, 
                                          scomplex* buff_Z, int rs_Z, int cs_Z, 
                                          scomplex* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Tridiag_UT_l_step_opz_var3( int m_A,
                                          int m_T,
                                          dcomplex* buff_A, int rs_A, int cs_A, 
                                          dcomplex* buff_Z, int rs_Z, int cs_Z, 
                                          dcomplex* buff_T, int rs_T, int cs_T );

// FLA_Tridiag_UT_l variant 1, "ofu" family: two FLA_Obj-based entry points
// (ofu, step_ofu) followed by typed step routines (ofs/ofd/ofc/ofz) whose
// last letter matches the buffer element type (float, double, scomplex,
// dcomplex). Signatures mirror the corresponding "opt" family routines.
// NOTE(review): "of" presumably denotes an optimized variant built on fused
// kernels -- confirm against the implementation.
FLA_Error FLA_Tridiag_UT_l_ofu_var1( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Tridiag_UT_l_step_ofu_var1( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Tridiag_UT_l_step_ofs_var1( int m_A,
                                          int m_T,
                                          float* buff_A, int rs_A, int cs_A, 
                                          float* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Tridiag_UT_l_step_ofd_var1( int m_A,
                                          int m_T,
                                          double* buff_A, int rs_A, int cs_A, 
                                          double* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Tridiag_UT_l_step_ofc_var1( int m_A,
                                          int m_T,
                                          scomplex* buff_A, int rs_A, int cs_A, 
                                          scomplex* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Tridiag_UT_l_step_ofz_var1( int m_A,
                                          int m_T,
                                          dcomplex* buff_A, int rs_A, int cs_A, 
                                          dcomplex* buff_T, int rs_T, int cs_T );

// FLA_Tridiag_UT_l variant 2, "ofu" family: two FLA_Obj-based entry points
// (ofu, step_ofu) followed by typed step routines (ofs/ofd/ofc/ofz) whose
// last letter matches the buffer element type (float, double, scomplex,
// dcomplex). Signatures mirror the corresponding "opt" family routines.
// NOTE(review): "of" presumably denotes an optimized variant built on fused
// kernels -- confirm against the implementation.
FLA_Error FLA_Tridiag_UT_l_ofu_var2( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Tridiag_UT_l_step_ofu_var2( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Tridiag_UT_l_step_ofs_var2( int m_A,
                                          int m_T,
                                          float* buff_A, int rs_A, int cs_A, 
                                          float* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Tridiag_UT_l_step_ofd_var2( int m_A,
                                          int m_T,
                                          double* buff_A, int rs_A, int cs_A, 
                                          double* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Tridiag_UT_l_step_ofc_var2( int m_A,
                                          int m_T,
                                          scomplex* buff_A, int rs_A, int cs_A, 
                                          scomplex* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Tridiag_UT_l_step_ofz_var2( int m_A,
                                          int m_T,
                                          dcomplex* buff_A, int rs_A, int cs_A, 
                                          dcomplex* buff_T, int rs_T, int cs_T );

// FLA_Tridiag_UT_l variant 3, "ofu" family. As in the "opt" family of this
// variant, the step routines take one extra operand, Z, between A and T.
// The last letter of ofs/ofd/ofc/ofz matches the buffer element type (float,
// double, scomplex, dcomplex); rs_*/cs_* are presumably row/column strides
// -- TODO confirm.
FLA_Error FLA_Tridiag_UT_l_ofu_var3( FLA_Obj A, FLA_Obj T );
FLA_Error FLA_Tridiag_UT_l_step_ofu_var3( FLA_Obj A, FLA_Obj Z, FLA_Obj T );
FLA_Error FLA_Tridiag_UT_l_step_ofs_var3( int m_A,
                                          int m_T,
                                          float* buff_A, int rs_A, int cs_A, 
                                          float* buff_Z, int rs_Z, int cs_Z,
                                          float* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Tridiag_UT_l_step_ofd_var3( int m_A,
                                          int m_T,
                                          double* buff_A, int rs_A, int cs_A, 
                                          double* buff_Z, int rs_Z, int cs_Z,
                                          double* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Tridiag_UT_l_step_ofc_var3( int m_A,
                                          int m_T,
                                          scomplex* buff_A, int rs_A, int cs_A, 
                                          scomplex* buff_Z, int rs_Z, int cs_Z,
                                          scomplex* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Tridiag_UT_l_step_ofz_var3( int m_A,
                                          int m_T,
                                          dcomplex* buff_A, int rs_A, int cs_A, 
                                          dcomplex* buff_Z, int rs_Z, int cs_Z,
                                          dcomplex* buff_T, int rs_T, int cs_T );

// --- Fused operations ---

// FLA_Fused_Her2_Ax_l: an FLA_Obj-based entry point (opt) followed by typed
// buffer-level routines, one per element type (float, double, scomplex,
// dcomplex). Operands, in order: scalar alpha, vectors u and z, matrix A,
// vectors x and w. The typed routines take a single dimension m_A; vectors
// are passed with an inc_* int and A with rs_A/cs_A.
// NOTE(review): the name suggests a Hermitian rank-2 update of A fused with
// an A x product -- the exact operation is not visible in this header.
FLA_Error FLA_Fused_Her2_Ax_l_opt_var1( FLA_Obj alpha, FLA_Obj u, FLA_Obj z, FLA_Obj A, FLA_Obj x, FLA_Obj w );
FLA_Error FLA_Fused_Her2_Ax_l_ops_var1( int m_A,
                                        float* buff_alpha, 
                                        float* buff_u, int inc_u, 
                                        float* buff_z, int inc_z, 
                                        float* buff_A, int rs_A, int cs_A, 
                                        float* buff_x, int inc_x, 
                                        float* buff_w, int inc_w );
FLA_Error FLA_Fused_Her2_Ax_l_opd_var1( int m_A,
                                        double* buff_alpha, 
                                        double* buff_u, int inc_u, 
                                        double* buff_z, int inc_z, 
                                        double* buff_A, int rs_A, int cs_A, 
                                        double* buff_x, int inc_x, 
                                        double* buff_w, int inc_w );
FLA_Error FLA_Fused_Her2_Ax_l_opc_var1( int m_A,
                                        scomplex* buff_alpha, 
                                        scomplex* buff_u, int inc_u, 
                                        scomplex* buff_z, int inc_z, 
                                        scomplex* buff_A, int rs_A, int cs_A, 
                                        scomplex* buff_x, int inc_x, 
                                        scomplex* buff_w, int inc_w );
FLA_Error FLA_Fused_Her2_Ax_l_opz_var1( int m_A,
                                        dcomplex* buff_alpha, 
                                        dcomplex* buff_u, int inc_u, 
                                        dcomplex* buff_z, int inc_z, 
                                        dcomplex* buff_A, int rs_A, int cs_A, 
                                        dcomplex* buff_x, int inc_x, 
                                        dcomplex* buff_w, int inc_w );

// FLA_Fused_UZhu_ZUhu: an FLA_Obj-based entry point (opt) followed by typed
// buffer-level routines, one per element type (float, double, scomplex,
// dcomplex). Operands, in order: scalar delta, matrices U and Z, vectors t,
// u and w. The typed routines take the dimensions m_U and n_U; matrices are
// passed with rs_*/cs_* and vectors with inc_*.
// NOTE(review): the name suggests fused U Z^H u and Z U^H u products -- the
// exact operation and the roles of t and w are not visible in this header.
FLA_Error FLA_Fused_UZhu_ZUhu_opt_var1( FLA_Obj delta, FLA_Obj U, FLA_Obj Z, FLA_Obj t, FLA_Obj u, FLA_Obj w );
FLA_Error FLA_Fused_UZhu_ZUhu_ops_var1( int m_U,
                                        int n_U,
                                        float* buff_delta, 
                                        float* buff_U, int rs_U, int cs_U, 
                                        float* buff_Z, int rs_Z, int cs_Z, 
                                        float* buff_t, int inc_t, 
                                        float* buff_u, int inc_u, 
                                        float* buff_w, int inc_w );
FLA_Error FLA_Fused_UZhu_ZUhu_opd_var1( int m_U,
                                        int n_U,
                                        double* buff_delta, 
                                        double* buff_U, int rs_U, int cs_U, 
                                        double* buff_Z, int rs_Z, int cs_Z, 
                                        double* buff_t, int inc_t, 
                                        double* buff_u, int inc_u, 
                                        double* buff_w, int inc_w );
FLA_Error FLA_Fused_UZhu_ZUhu_opc_var1( int m_U,
                                        int n_U,
                                        scomplex* buff_delta, 
                                        scomplex* buff_U, int rs_U, int cs_U, 
                                        scomplex* buff_Z, int rs_Z, int cs_Z, 
                                        scomplex* buff_t, int inc_t, 
                                        scomplex* buff_u, int inc_u, 
                                        scomplex* buff_w, int inc_w );
FLA_Error FLA_Fused_UZhu_ZUhu_opz_var1( int m_U,
                                        int n_U,
                                        dcomplex* buff_delta, 
                                        dcomplex* buff_U, int rs_U, int cs_U, 
                                        dcomplex* buff_Z, int rs_Z, int cs_Z, 
                                        dcomplex* buff_t, int inc_t, 
                                        dcomplex* buff_u, int inc_u, 
                                        dcomplex* buff_w, int inc_w );
// end FLA_Tridiag_UT_l.h
//#include "FLA_Tridiag_UT_u.h"

// Object-level entry point of the Tridiag_UT family: takes a FLA_Uplo
// selector and the objects A and T.
FLA_Error FLA_Tridiag_UT( FLA_Uplo uplo, FLA_Obj A, FLA_Obj T );

// Same arguments as FLA_Tridiag_UT plus a fla_tridiagut_t control pointer.
FLA_Error FLA_Tridiag_UT_internal( FLA_Uplo uplo, FLA_Obj A, FLA_Obj T, fla_tridiagut_t* cntl );

// _l / _u entry points: no uplo argument, same control pointer.
// NOTE(review): presumably the two cases FLA_Tridiag_UT_internal selects
// between via uplo -- confirm in the implementation.
FLA_Error FLA_Tridiag_UT_l( FLA_Obj A, FLA_Obj T, fla_tridiagut_t* cntl );
FLA_Error FLA_Tridiag_UT_u( FLA_Obj A, FLA_Obj T, fla_tridiagut_t* cntl );

// Helpers for the T object. create_T receives T by pointer; recover_tau
// receives T and t by value.
FLA_Error FLA_Tridiag_UT_create_T( FLA_Obj A, FLA_Obj* T );
FLA_Error FLA_Tridiag_UT_recover_tau( FLA_Obj T, FLA_Obj t );

// Takes a uplo selector, a scalar object alpha and the matrix object A.
FLA_Error FLA_Tridiag_UT_scale_diagonals( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A );

// Diagonal extraction: both take uplo, A and two further objects d and e.
// The two "////" lines below are disabled uplo-specific prototypes.
FLA_Error FLA_Tridiag_UT_extract_diagonals( FLA_Uplo uplo, FLA_Obj A, FLA_Obj d, FLA_Obj e );
FLA_Error FLA_Tridiag_UT_extract_real_diagonals( FLA_Uplo uplo, FLA_Obj A, FLA_Obj d, FLA_Obj e );
//// FLA_Error FLA_Tridiag_UT_l_extract_diagonals( FLA_Obj A, FLA_Obj d, FLA_Obj e );
//// FLA_Error FLA_Tridiag_UT_u_extract_diagonals( FLA_Obj A, FLA_Obj d, FLA_Obj e );

// "realify" family: a uplo-taking front end and _l/_u variants, each of the
// latter in an _unb and an _opt flavour. All operate on A and d.
FLA_Error FLA_Tridiag_UT_realify( FLA_Uplo uplo, FLA_Obj A, FLA_Obj d );
FLA_Error FLA_Tridiag_UT_l_realify_unb( FLA_Obj A, FLA_Obj d );
FLA_Error FLA_Tridiag_UT_l_realify_opt( FLA_Obj A, FLA_Obj d );
FLA_Error FLA_Tridiag_UT_u_realify_unb( FLA_Obj A, FLA_Obj d );
FLA_Error FLA_Tridiag_UT_u_realify_opt( FLA_Obj A, FLA_Obj d );

// Subdiagonal-only variants operating on the objects b and d.
FLA_Error FLA_Tridiag_UT_realify_subdiagonal( FLA_Obj b, FLA_Obj d );
FLA_Error FLA_Tridiag_UT_realify_subdiagonal_opt( FLA_Obj b, FLA_Obj d );

// FLA_Tridiag_UT_shift_U: object-level entry point followed by its typed
// kernels. Kernel names combine _l/_u with a datatype suffix; the buffer
// type is float (ops), double (opd), scomplex (opc) or dcomplex (opz).
// NOTE(review): m_A looks like a matrix dimension and rs_A/cs_A like the
// row/column strides of buff_A -- confirm against the definitions.
FLA_Error FLA_Tridiag_UT_shift_U( FLA_Uplo uplo, FLA_Obj A );

// float
FLA_Error FLA_Tridiag_UT_shift_U_l_ops( int m_A, float* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Tridiag_UT_shift_U_u_ops( int m_A, float* buff_A, int rs_A, int cs_A );
// double
FLA_Error FLA_Tridiag_UT_shift_U_l_opd( int m_A, double* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Tridiag_UT_shift_U_u_opd( int m_A, double* buff_A, int rs_A, int cs_A );
// scomplex
FLA_Error FLA_Tridiag_UT_shift_U_l_opc( int m_A, scomplex* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Tridiag_UT_shift_U_u_opc( int m_A, scomplex* buff_A, int rs_A, int cs_A );
// dcomplex
FLA_Error FLA_Tridiag_UT_shift_U_l_opz( int m_A, dcomplex* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Tridiag_UT_shift_U_u_opz( int m_A, dcomplex* buff_A, int rs_A, int cs_A );

// FLA_Tridiag_UT_form_Q family. The front end takes uplo, A, T and Q.
// The blocked _l/_u variants take A, T and a third object W; the _l
// optimized variant takes only A and T.
FLA_Error FLA_Tridiag_UT_form_Q( FLA_Uplo uplo, FLA_Obj A, FLA_Obj T, FLA_Obj Q );
FLA_Error FLA_Tridiag_UT_form_Q_l_blk_var1( FLA_Obj A, FLA_Obj T, FLA_Obj W );
FLA_Error FLA_Tridiag_UT_form_Q_u_blk_var1( FLA_Obj A, FLA_Obj T, FLA_Obj W );
FLA_Error FLA_Tridiag_UT_form_Q_l_opt_var1( FLA_Obj A, FLA_Obj T );

// Typed kernels for the _l optimized variant: two ints (m_A, n_AT), then
// the A and T buffers, each followed by its rs_/cs_ pair. Only _l kernels
// are declared in this section.
// NOTE(review): rs_/cs_ presumably row/column strides -- confirm.
FLA_Error FLA_Tridiag_UT_form_Q_l_ops_var1(
    int m_A, int n_AT,
    float* buff_A, int rs_A, int cs_A,
    float* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Tridiag_UT_form_Q_l_opd_var1(
    int m_A, int n_AT,
    double* buff_A, int rs_A, int cs_A,
    double* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Tridiag_UT_form_Q_l_opc_var1(
    int m_A, int n_AT,
    scomplex* buff_A, int rs_A, int cs_A,
    scomplex* buff_T, int rs_T, int cs_T );
FLA_Error FLA_Tridiag_UT_form_Q_l_opz_var1(
    int m_A, int n_AT,
    dcomplex* buff_A, int rs_A, int cs_A,
    dcomplex* buff_T, int rs_T, int cs_T );
// end FLA_Tridiag_UT.h
// begin FLA_Bidiag_UT.h


//#include "FLA_Bidiag_UT_l.h"
// begin FLA_Bidiag_UT_u.h


// Object-level Bidiag_UT_u algorithm variants 1-5. Each variant declares
// _unb, _blk and _step_unb entry points; variants 2-4 additionally declare
// a _blf entry point. All take the objects A, TU and TV unless noted.

// Variant 1.
FLA_Error FLA_Bidiag_UT_u_unb_var1( FLA_Obj A, FLA_Obj TU, FLA_Obj TV );
FLA_Error FLA_Bidiag_UT_u_blk_var1( FLA_Obj A, FLA_Obj TU, FLA_Obj TV );
FLA_Error FLA_Bidiag_UT_u_step_unb_var1( FLA_Obj A, FLA_Obj TU, FLA_Obj TV );

// Variant 2.
FLA_Error FLA_Bidiag_UT_u_unb_var2( FLA_Obj A, FLA_Obj TU, FLA_Obj TV );
FLA_Error FLA_Bidiag_UT_u_blk_var2( FLA_Obj A, FLA_Obj TU, FLA_Obj TV );
FLA_Error FLA_Bidiag_UT_u_blf_var2( FLA_Obj A, FLA_Obj TU, FLA_Obj TV );
FLA_Error FLA_Bidiag_UT_u_step_unb_var2( FLA_Obj A, FLA_Obj TU, FLA_Obj TV );

// Variant 3.
FLA_Error FLA_Bidiag_UT_u_unb_var3( FLA_Obj A, FLA_Obj TU, FLA_Obj TV );
FLA_Error FLA_Bidiag_UT_u_blk_var3( FLA_Obj A, FLA_Obj TU, FLA_Obj TV );
FLA_Error FLA_Bidiag_UT_u_blf_var3( FLA_Obj A, FLA_Obj TU, FLA_Obj TV );
FLA_Error FLA_Bidiag_UT_u_step_unb_var3( FLA_Obj A, FLA_Obj TU, FLA_Obj TV );

// Variant 4. The step routine takes two extra objects, Y and Z.
FLA_Error FLA_Bidiag_UT_u_unb_var4( FLA_Obj A, FLA_Obj TU, FLA_Obj TV );
FLA_Error FLA_Bidiag_UT_u_blk_var4( FLA_Obj A, FLA_Obj TU, FLA_Obj TV );
FLA_Error FLA_Bidiag_UT_u_blf_var4( FLA_Obj A, FLA_Obj TU, FLA_Obj TV );
FLA_Error FLA_Bidiag_UT_u_step_unb_var4( FLA_Obj A, FLA_Obj Y, FLA_Obj Z, FLA_Obj TU, FLA_Obj TV );

// Variant 5. The step routine takes two extra objects, Y and Z.
FLA_Error FLA_Bidiag_UT_u_unb_var5( FLA_Obj A, FLA_Obj TU, FLA_Obj TV );
FLA_Error FLA_Bidiag_UT_u_blk_var5( FLA_Obj A, FLA_Obj TU, FLA_Obj TV );
FLA_Error FLA_Bidiag_UT_u_step_unb_var5( FLA_Obj A, FLA_Obj Y, FLA_Obj Z, FLA_Obj TU, FLA_Obj TV );

// Optimized variant 1: object-level entry points taking A, T and S.
FLA_Error FLA_Bidiag_UT_u_opt_var1( FLA_Obj A, FLA_Obj T, FLA_Obj S );
FLA_Error FLA_Bidiag_UT_u_step_opt_var1( FLA_Obj A, FLA_Obj T, FLA_Obj S );

// Typed step kernels for variant 1, one per buffer type: float (ops),
// double (opd), scomplex (opc), dcomplex (opz). Each takes three ints
// (m_A, n_A, m_TS) and the A, T, S buffers with an rs_/cs_ pair apiece.
// NOTE(review): rs_/cs_ presumably row/column strides -- confirm.
FLA_Error FLA_Bidiag_UT_u_step_ops_var1(
    int m_A, int n_A, int m_TS,
    float* buff_A, int rs_A, int cs_A,
    float* buff_T, int rs_T, int cs_T,
    float* buff_S, int rs_S, int cs_S );
FLA_Error FLA_Bidiag_UT_u_step_opd_var1(
    int m_A, int n_A, int m_TS,
    double* buff_A, int rs_A, int cs_A,
    double* buff_T, int rs_T, int cs_T,
    double* buff_S, int rs_S, int cs_S );
FLA_Error FLA_Bidiag_UT_u_step_opc_var1(
    int m_A, int n_A, int m_TS,
    scomplex* buff_A, int rs_A, int cs_A,
    scomplex* buff_T, int rs_T, int cs_T,
    scomplex* buff_S, int rs_S, int cs_S );
FLA_Error FLA_Bidiag_UT_u_step_opz_var1(
    int m_A, int n_A, int m_TS,
    dcomplex* buff_A, int rs_A, int cs_A,
    dcomplex* buff_T, int rs_T, int cs_T,
    dcomplex* buff_S, int rs_S, int cs_S );

// Optimized variant 2: object-level entry points taking A, T and S.
FLA_Error FLA_Bidiag_UT_u_opt_var2( FLA_Obj A, FLA_Obj T, FLA_Obj S );
FLA_Error FLA_Bidiag_UT_u_step_opt_var2( FLA_Obj A, FLA_Obj T, FLA_Obj S );

// Typed step kernels for variant 2 (float / double / scomplex / dcomplex).
// Parameter layout: m_A, n_A, m_TS, then the A, T, S buffers, each followed
// by its rs_/cs_ pair.
// NOTE(review): rs_/cs_ presumably row/column strides -- confirm.
FLA_Error FLA_Bidiag_UT_u_step_ops_var2(
    int m_A, int n_A, int m_TS,
    float* buff_A, int rs_A, int cs_A,
    float* buff_T, int rs_T, int cs_T,
    float* buff_S, int rs_S, int cs_S );
FLA_Error FLA_Bidiag_UT_u_step_opd_var2(
    int m_A, int n_A, int m_TS,
    double* buff_A, int rs_A, int cs_A,
    double* buff_T, int rs_T, int cs_T,
    double* buff_S, int rs_S, int cs_S );
FLA_Error FLA_Bidiag_UT_u_step_opc_var2(
    int m_A, int n_A, int m_TS,
    scomplex* buff_A, int rs_A, int cs_A,
    scomplex* buff_T, int rs_T, int cs_T,
    scomplex* buff_S, int rs_S, int cs_S );
FLA_Error FLA_Bidiag_UT_u_step_opz_var2(
    int m_A, int n_A, int m_TS,
    dcomplex* buff_A, int rs_A, int cs_A,
    dcomplex* buff_T, int rs_T, int cs_T,
    dcomplex* buff_S, int rs_S, int cs_S );

// Optimized variant 3: object-level entry points taking A, T and S.
FLA_Error FLA_Bidiag_UT_u_opt_var3( FLA_Obj A, FLA_Obj T, FLA_Obj S );
FLA_Error FLA_Bidiag_UT_u_step_opt_var3( FLA_Obj A, FLA_Obj T, FLA_Obj S );

// Typed step kernels for variant 3 (float / double / scomplex / dcomplex).
// Parameter layout: m_A, n_A, m_TS, then the A, T, S buffers, each followed
// by its rs_/cs_ pair.
// NOTE(review): rs_/cs_ presumably row/column strides -- confirm.
FLA_Error FLA_Bidiag_UT_u_step_ops_var3(
    int m_A, int n_A, int m_TS,
    float* buff_A, int rs_A, int cs_A,
    float* buff_T, int rs_T, int cs_T,
    float* buff_S, int rs_S, int cs_S );
FLA_Error FLA_Bidiag_UT_u_step_opd_var3(
    int m_A, int n_A, int m_TS,
    double* buff_A, int rs_A, int cs_A,
    double* buff_T, int rs_T, int cs_T,
    double* buff_S, int rs_S, int cs_S );
FLA_Error FLA_Bidiag_UT_u_step_opc_var3(
    int m_A, int n_A, int m_TS,
    scomplex* buff_A, int rs_A, int cs_A,
    scomplex* buff_T, int rs_T, int cs_T,
    scomplex* buff_S, int rs_S, int cs_S );
FLA_Error FLA_Bidiag_UT_u_step_opz_var3(
    int m_A, int n_A, int m_TS,
    dcomplex* buff_A, int rs_A, int cs_A,
    dcomplex* buff_T, int rs_T, int cs_T,
    dcomplex* buff_S, int rs_S, int cs_S );

// Optimized variant 4. The top-level routine takes A, T and S; the step
// routine additionally takes the objects Y and Z.
FLA_Error FLA_Bidiag_UT_u_opt_var4( FLA_Obj A, FLA_Obj T, FLA_Obj S );
FLA_Error FLA_Bidiag_UT_u_step_opt_var4( FLA_Obj A, FLA_Obj Y, FLA_Obj Z, FLA_Obj T, FLA_Obj S );

// Typed step kernels for variant 4 (float / double / scomplex / dcomplex).
// Parameter layout: m_A, n_A, m_TS, then the A, Y, Z, T, S buffers, each
// followed by its rs_/cs_ pair.
// NOTE(review): rs_/cs_ presumably row/column strides -- confirm.
FLA_Error FLA_Bidiag_UT_u_step_ops_var4(
    int m_A, int n_A, int m_TS,
    float* buff_A, int rs_A, int cs_A,
    float* buff_Y, int rs_Y, int cs_Y,
    float* buff_Z, int rs_Z, int cs_Z,
    float* buff_T, int rs_T, int cs_T,
    float* buff_S, int rs_S, int cs_S );
FLA_Error FLA_Bidiag_UT_u_step_opd_var4(
    int m_A, int n_A, int m_TS,
    double* buff_A, int rs_A, int cs_A,
    double* buff_Y, int rs_Y, int cs_Y,
    double* buff_Z, int rs_Z, int cs_Z,
    double* buff_T, int rs_T, int cs_T,
    double* buff_S, int rs_S, int cs_S );
FLA_Error FLA_Bidiag_UT_u_step_opc_var4(
    int m_A, int n_A, int m_TS,
    scomplex* buff_A, int rs_A, int cs_A,
    scomplex* buff_Y, int rs_Y, int cs_Y,
    scomplex* buff_Z, int rs_Z, int cs_Z,
    scomplex* buff_T, int rs_T, int cs_T,
    scomplex* buff_S, int rs_S, int cs_S );
FLA_Error FLA_Bidiag_UT_u_step_opz_var4(
    int m_A, int n_A, int m_TS,
    dcomplex* buff_A, int rs_A, int cs_A,
    dcomplex* buff_Y, int rs_Y, int cs_Y,
    dcomplex* buff_Z, int rs_Z, int cs_Z,
    dcomplex* buff_T, int rs_T, int cs_T,
    dcomplex* buff_S, int rs_S, int cs_S );

// Optimized variant 5. The top-level routine takes A, T and S; the step
// routine additionally takes the objects Y and Z.
FLA_Error FLA_Bidiag_UT_u_opt_var5( FLA_Obj A, FLA_Obj T, FLA_Obj S );
FLA_Error FLA_Bidiag_UT_u_step_opt_var5( FLA_Obj A, FLA_Obj Y, FLA_Obj Z, FLA_Obj T, FLA_Obj S );

// Typed step kernels for variant 5 (float / double / scomplex / dcomplex).
// Parameter layout: m_A, n_A, m_TS, then the A, Y, Z, T, S buffers, each
// followed by its rs_/cs_ pair.
// NOTE(review): rs_/cs_ presumably row/column strides -- confirm.
FLA_Error FLA_Bidiag_UT_u_step_ops_var5(
    int m_A, int n_A, int m_TS,
    float* buff_A, int rs_A, int cs_A,
    float* buff_Y, int rs_Y, int cs_Y,
    float* buff_Z, int rs_Z, int cs_Z,
    float* buff_T, int rs_T, int cs_T,
    float* buff_S, int rs_S, int cs_S );
FLA_Error FLA_Bidiag_UT_u_step_opd_var5(
    int m_A, int n_A, int m_TS,
    double* buff_A, int rs_A, int cs_A,
    double* buff_Y, int rs_Y, int cs_Y,
    double* buff_Z, int rs_Z, int cs_Z,
    double* buff_T, int rs_T, int cs_T,
    double* buff_S, int rs_S, int cs_S );
FLA_Error FLA_Bidiag_UT_u_step_opc_var5(
    int m_A, int n_A, int m_TS,
    scomplex* buff_A, int rs_A, int cs_A,
    scomplex* buff_Y, int rs_Y, int cs_Y,
    scomplex* buff_Z, int rs_Z, int cs_Z,
    scomplex* buff_T, int rs_T, int cs_T,
    scomplex* buff_S, int rs_S, int cs_S );
FLA_Error FLA_Bidiag_UT_u_step_opz_var5(
    int m_A, int n_A, int m_TS,
    dcomplex* buff_A, int rs_A, int cs_A,
    dcomplex* buff_Y, int rs_Y, int cs_Y,
    dcomplex* buff_Z, int rs_Z, int cs_Z,
    dcomplex* buff_T, int rs_T, int cs_T,
    dcomplex* buff_S, int rs_S, int cs_S );


// "ofu" variant 2: object-level entry points taking A, T and S.
// NOTE(review): the "of*" prefix presumably marks the implementations built
// on the FLA_Fused_* kernels declared further down -- confirm.
FLA_Error FLA_Bidiag_UT_u_ofu_var2( FLA_Obj A, FLA_Obj T, FLA_Obj S );
FLA_Error FLA_Bidiag_UT_u_step_ofu_var2( FLA_Obj A, FLA_Obj T, FLA_Obj S );

// Typed step kernels: float (ofs), double (ofd), scomplex (ofc),
// dcomplex (ofz). Parameter layout: m_A, n_A, m_TS, then the A, T, S
// buffers, each followed by its rs_/cs_ pair.
FLA_Error FLA_Bidiag_UT_u_step_ofs_var2(
    int m_A, int n_A, int m_TS,
    float* buff_A, int rs_A, int cs_A,
    float* buff_T, int rs_T, int cs_T,
    float* buff_S, int rs_S, int cs_S );
FLA_Error FLA_Bidiag_UT_u_step_ofd_var2(
    int m_A, int n_A, int m_TS,
    double* buff_A, int rs_A, int cs_A,
    double* buff_T, int rs_T, int cs_T,
    double* buff_S, int rs_S, int cs_S );
FLA_Error FLA_Bidiag_UT_u_step_ofc_var2(
    int m_A, int n_A, int m_TS,
    scomplex* buff_A, int rs_A, int cs_A,
    scomplex* buff_T, int rs_T, int cs_T,
    scomplex* buff_S, int rs_S, int cs_S );
FLA_Error FLA_Bidiag_UT_u_step_ofz_var2(
    int m_A, int n_A, int m_TS,
    dcomplex* buff_A, int rs_A, int cs_A,
    dcomplex* buff_T, int rs_T, int cs_T,
    dcomplex* buff_S, int rs_S, int cs_S );

// "ofu" variant 3: object-level entry points taking A, T and S.
FLA_Error FLA_Bidiag_UT_u_ofu_var3( FLA_Obj A, FLA_Obj T, FLA_Obj S );
FLA_Error FLA_Bidiag_UT_u_step_ofu_var3( FLA_Obj A, FLA_Obj T, FLA_Obj S );

// Typed step kernels: float (ofs), double (ofd), scomplex (ofc),
// dcomplex (ofz). Parameter layout: m_A, n_A, m_TS, then the A, T, S
// buffers, each followed by its rs_/cs_ pair.
// NOTE(review): rs_/cs_ presumably row/column strides -- confirm.
FLA_Error FLA_Bidiag_UT_u_step_ofs_var3(
    int m_A, int n_A, int m_TS,
    float* buff_A, int rs_A, int cs_A,
    float* buff_T, int rs_T, int cs_T,
    float* buff_S, int rs_S, int cs_S );
FLA_Error FLA_Bidiag_UT_u_step_ofd_var3(
    int m_A, int n_A, int m_TS,
    double* buff_A, int rs_A, int cs_A,
    double* buff_T, int rs_T, int cs_T,
    double* buff_S, int rs_S, int cs_S );
FLA_Error FLA_Bidiag_UT_u_step_ofc_var3(
    int m_A, int n_A, int m_TS,
    scomplex* buff_A, int rs_A, int cs_A,
    scomplex* buff_T, int rs_T, int cs_T,
    scomplex* buff_S, int rs_S, int cs_S );
FLA_Error FLA_Bidiag_UT_u_step_ofz_var3(
    int m_A, int n_A, int m_TS,
    dcomplex* buff_A, int rs_A, int cs_A,
    dcomplex* buff_T, int rs_T, int cs_T,
    dcomplex* buff_S, int rs_S, int cs_S );

// "ofu" variant 4. The top-level routine takes A, T and S; the step
// routine additionally takes the objects Y and Z.
FLA_Error FLA_Bidiag_UT_u_ofu_var4( FLA_Obj A, FLA_Obj T, FLA_Obj S );
FLA_Error FLA_Bidiag_UT_u_step_ofu_var4( FLA_Obj A, FLA_Obj Y, FLA_Obj Z, FLA_Obj T, FLA_Obj S );

// Typed step kernels: float (ofs), double (ofd), scomplex (ofc),
// dcomplex (ofz). Parameter layout: m_A, n_A, m_TS, then the A, Y, Z, T, S
// buffers, each followed by its rs_/cs_ pair.
// NOTE(review): rs_/cs_ presumably row/column strides -- confirm.
FLA_Error FLA_Bidiag_UT_u_step_ofs_var4(
    int m_A, int n_A, int m_TS,
    float* buff_A, int rs_A, int cs_A,
    float* buff_Y, int rs_Y, int cs_Y,
    float* buff_Z, int rs_Z, int cs_Z,
    float* buff_T, int rs_T, int cs_T,
    float* buff_S, int rs_S, int cs_S );
FLA_Error FLA_Bidiag_UT_u_step_ofd_var4(
    int m_A, int n_A, int m_TS,
    double* buff_A, int rs_A, int cs_A,
    double* buff_Y, int rs_Y, int cs_Y,
    double* buff_Z, int rs_Z, int cs_Z,
    double* buff_T, int rs_T, int cs_T,
    double* buff_S, int rs_S, int cs_S );
FLA_Error FLA_Bidiag_UT_u_step_ofc_var4(
    int m_A, int n_A, int m_TS,
    scomplex* buff_A, int rs_A, int cs_A,
    scomplex* buff_Y, int rs_Y, int cs_Y,
    scomplex* buff_Z, int rs_Z, int cs_Z,
    scomplex* buff_T, int rs_T, int cs_T,
    scomplex* buff_S, int rs_S, int cs_S );
FLA_Error FLA_Bidiag_UT_u_step_ofz_var4(
    int m_A, int n_A, int m_TS,
    dcomplex* buff_A, int rs_A, int cs_A,
    dcomplex* buff_Y, int rs_Y, int cs_Y,
    dcomplex* buff_Z, int rs_Z, int cs_Z,
    dcomplex* buff_T, int rs_T, int cs_T,
    dcomplex* buff_S, int rs_S, int cs_S );

// --- Fused operations ---

// FLA_Fused_Gerc2: object-level wrapper followed by its typed kernels.
// The wrapper takes a scalar object alpha, four objects u, y, z, v and the
// object A.
FLA_Error FLA_Fused_Gerc2_opt_var1( FLA_Obj alpha, FLA_Obj u, FLA_Obj y, FLA_Obj z, FLA_Obj v, FLA_Obj A );

// Typed kernels (float / double / scomplex / dcomplex), in the same argument
// order as the wrapper: m_A, n_A, a pointer buff_alpha, the u, y, z, v
// buffers each with one inc_ value, then buff_A with its rs_/cs_ pair.
// NOTE(review): inc_ presumably a vector increment and rs_/cs_ row/column
// strides -- confirm against the definitions.
FLA_Error FLA_Fused_Gerc2_ops_var1(
    int m_A, int n_A,
    float* buff_alpha,
    float* buff_u, int inc_u,
    float* buff_y, int inc_y,
    float* buff_z, int inc_z,
    float* buff_v, int inc_v,
    float* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Fused_Gerc2_opd_var1(
    int m_A, int n_A,
    double* buff_alpha,
    double* buff_u, int inc_u,
    double* buff_y, int inc_y,
    double* buff_z, int inc_z,
    double* buff_v, int inc_v,
    double* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Fused_Gerc2_opc_var1(
    int m_A, int n_A,
    scomplex* buff_alpha,
    scomplex* buff_u, int inc_u,
    scomplex* buff_y, int inc_y,
    scomplex* buff_z, int inc_z,
    scomplex* buff_v, int inc_v,
    scomplex* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Fused_Gerc2_opz_var1(
    int m_A, int n_A,
    dcomplex* buff_alpha,
    dcomplex* buff_u, int inc_u,
    dcomplex* buff_y, int inc_y,
    dcomplex* buff_z, int inc_z,
    dcomplex* buff_v, int inc_v,
    dcomplex* buff_A, int rs_A, int cs_A );


// FLA_Fused_Ahx_Axpy_Ax: object-level wrapper followed by its typed kernels.
// Wrapper argument order: A, u, tau, a, beta, y, w.
FLA_Error FLA_Fused_Ahx_Axpy_Ax_opt_var1(
    FLA_Obj A, FLA_Obj u, FLA_Obj tau, FLA_Obj a,
    FLA_Obj beta, FLA_Obj y, FLA_Obj w );

// Typed kernels (float / double / scomplex / dcomplex). Their argument order
// differs from the wrapper: the pointers buff_tau and buff_beta come first
// (after m_A, n_A), followed by buff_A with its rs_/cs_ pair and then the
// u, a, y, w buffers, each with one inc_ value.
// NOTE(review): inc_ presumably a vector increment and rs_/cs_ row/column
// strides -- confirm against the definitions.
FLA_Error FLA_Fused_Ahx_Axpy_Ax_ops_var1(
    int m_A, int n_A,
    float* buff_tau,
    float* buff_beta,
    float* buff_A, int rs_A, int cs_A,
    float* buff_u, int inc_u,
    float* buff_a, int inc_a,
    float* buff_y, int inc_y,
    float* buff_w, int inc_w );
FLA_Error FLA_Fused_Ahx_Axpy_Ax_opd_var1(
    int m_A, int n_A,
    double* buff_tau,
    double* buff_beta,
    double* buff_A, int rs_A, int cs_A,
    double* buff_u, int inc_u,
    double* buff_a, int inc_a,
    double* buff_y, int inc_y,
    double* buff_w, int inc_w );
FLA_Error FLA_Fused_Ahx_Axpy_Ax_opc_var1(
    int m_A, int n_A,
    scomplex* buff_tau,
    scomplex* buff_beta,
    scomplex* buff_A, int rs_A, int cs_A,
    scomplex* buff_u, int inc_u,
    scomplex* buff_a, int inc_a,
    scomplex* buff_y, int inc_y,
    scomplex* buff_w, int inc_w );
FLA_Error FLA_Fused_Ahx_Axpy_Ax_opz_var1(
    int m_A, int n_A,
    dcomplex* buff_tau,
    dcomplex* buff_beta,
    dcomplex* buff_A, int rs_A, int cs_A,
    dcomplex* buff_u, int inc_u,
    dcomplex* buff_a, int inc_a,
    dcomplex* buff_y, int inc_y,
    dcomplex* buff_w, int inc_w );

// FLA_Fused_Gerc2_Ahx_Axpy_Ax: object-level wrapper followed by its typed
// kernels. Wrapper argument order: alpha, tau, u, y, z, v, A, up, a, w.
FLA_Error FLA_Fused_Gerc2_Ahx_Axpy_Ax_opt_var1(
    FLA_Obj alpha, FLA_Obj tau,
    FLA_Obj u, FLA_Obj y, FLA_Obj z, FLA_Obj v,
    FLA_Obj A,
    FLA_Obj up, FLA_Obj a, FLA_Obj w );

// Typed kernels (float / double / scomplex / dcomplex).
// Caution: the kernels take buff_tau BEFORE buff_alpha, the reverse of the
// wrapper's (alpha, tau) order. Both are pointers of the same type, so a
// swapped call compiles without any diagnostic.
// The remaining arguments follow the wrapper: the u, y, z, v buffers each
// with one inc_ value, buff_A with its rs_/cs_ pair, then up, a, w, each
// with one inc_ value.
FLA_Error FLA_Fused_Gerc2_Ahx_Axpy_Ax_ops_var1(
    int m_A, int n_A,
    float* buff_tau,
    float* buff_alpha,
    float* buff_u, int inc_u,
    float* buff_y, int inc_y,
    float* buff_z, int inc_z,
    float* buff_v, int inc_v,
    float* buff_A, int rs_A, int cs_A,
    float* buff_up, int inc_up,
    float* buff_a, int inc_a,
    float* buff_w, int inc_w );
FLA_Error FLA_Fused_Gerc2_Ahx_Axpy_Ax_opd_var1(
    int m_A, int n_A,
    double* buff_tau,
    double* buff_alpha,
    double* buff_u, int inc_u,
    double* buff_y, int inc_y,
    double* buff_z, int inc_z,
    double* buff_v, int inc_v,
    double* buff_A, int rs_A, int cs_A,
    double* buff_up, int inc_up,
    double* buff_a, int inc_a,
    double* buff_w, int inc_w );
FLA_Error FLA_Fused_Gerc2_Ahx_Axpy_Ax_opc_var1(
    int m_A, int n_A,
    scomplex* buff_tau,
    scomplex* buff_alpha,
    scomplex* buff_u, int inc_u,
    scomplex* buff_y, int inc_y,
    scomplex* buff_z, int inc_z,
    scomplex* buff_v, int inc_v,
    scomplex* buff_A, int rs_A, int cs_A,
    scomplex* buff_up, int inc_up,
    scomplex* buff_a, int inc_a,
    scomplex* buff_w, int inc_w );
FLA_Error FLA_Fused_Gerc2_Ahx_Axpy_Ax_opz_var1(
    int m_A, int n_A,
    dcomplex* buff_tau,
    dcomplex* buff_alpha,
    dcomplex* buff_u, int inc_u,
    dcomplex* buff_y, int inc_y,
    dcomplex* buff_z, int inc_z,
    dcomplex* buff_v, int inc_v,
    dcomplex* buff_A, int rs_A, int cs_A,
    dcomplex* buff_up, int inc_up,
    dcomplex* buff_a, int inc_a,
    dcomplex* buff_w, int inc_w );

// FLA_Fused_UYx_ZVx: object-level wrapper followed by its typed kernels.
// Wrapper argument order: delta, a, U, Y, Z, V, A, temp, t, w, al.
FLA_Error FLA_Fused_UYx_ZVx_opt_var1(
    FLA_Obj delta, FLA_Obj a,
    FLA_Obj U, FLA_Obj Y, FLA_Obj Z, FLA_Obj V, FLA_Obj A,
    FLA_Obj temp, FLA_Obj t, FLA_Obj w, FLA_Obj al );

// Typed kernels (float / double / scomplex / dcomplex). Four ints
// (m_U, n_U, m_V, n_V) and the pointer buff_delta come first, then the
// U, Y, Z, V, A buffers with an rs_/cs_ pair each, then the temp, t, a, w, al
// buffers with one inc_ value each.
// Note that `a` sits between t and w here, whereas the wrapper takes it
// second, right after delta.
// NOTE(review): inc_ presumably a vector increment and rs_/cs_ row/column
// strides -- confirm against the definitions.
FLA_Error FLA_Fused_UYx_ZVx_ops_var1(
    int m_U, int n_U, int m_V, int n_V,
    float* buff_delta,
    float* buff_U, int rs_U, int cs_U,
    float* buff_Y, int rs_Y, int cs_Y,
    float* buff_Z, int rs_Z, int cs_Z,
    float* buff_V, int rs_V, int cs_V,
    float* buff_A, int rs_A, int cs_A,
    float* buff_temp, int inc_temp,
    float* buff_t, int inc_t,
    float* buff_a, int inc_a,
    float* buff_w, int inc_w,
    float* buff_al, int inc_al );
FLA_Error FLA_Fused_UYx_ZVx_opd_var1(
    int m_U, int n_U, int m_V, int n_V,
    double* buff_delta,
    double* buff_U, int rs_U, int cs_U,
    double* buff_Y, int rs_Y, int cs_Y,
    double* buff_Z, int rs_Z, int cs_Z,
    double* buff_V, int rs_V, int cs_V,
    double* buff_A, int rs_A, int cs_A,
    double* buff_temp, int inc_temp,
    double* buff_t, int inc_t,
    double* buff_a, int inc_a,
    double* buff_w, int inc_w,
    double* buff_al, int inc_al );
FLA_Error FLA_Fused_UYx_ZVx_opc_var1(
    int m_U, int n_U, int m_V, int n_V,
    scomplex* buff_delta,
    scomplex* buff_U, int rs_U, int cs_U,
    scomplex* buff_Y, int rs_Y, int cs_Y,
    scomplex* buff_Z, int rs_Z, int cs_Z,
    scomplex* buff_V, int rs_V, int cs_V,
    scomplex* buff_A, int rs_A, int cs_A,
    scomplex* buff_temp, int inc_temp,
    scomplex* buff_t, int inc_t,
    scomplex* buff_a, int inc_a,
    scomplex* buff_w, int inc_w,
    scomplex* buff_al, int inc_al );
FLA_Error FLA_Fused_UYx_ZVx_opz_var1(
    int m_U, int n_U, int m_V, int n_V,
    dcomplex* buff_delta,
    dcomplex* buff_U, int rs_U, int cs_U,
    dcomplex* buff_Y, int rs_Y, int cs_Y,
    dcomplex* buff_Z, int rs_Z, int cs_Z,
    dcomplex* buff_V, int rs_V, int cs_V,
    dcomplex* buff_A, int rs_A, int cs_A,
    dcomplex* buff_temp, int inc_temp,
    dcomplex* buff_t, int inc_t,
    dcomplex* buff_a, int inc_a,
    dcomplex* buff_w, int inc_w,
    dcomplex* buff_al, int inc_al );
// end FLA_Bidiag_UT_u.h

// Object-level Bidiag_UT entry point: takes FLA_Obj operands A, TU and TV by
// value and returns an FLA_Error.
// NOTE(review): presumably a reduction of A to bidiagonal form with TU/TV
// holding the transform factors; confirm against the definition.
FLA_Error FLA_Bidiag_UT( FLA_Obj A, FLA_Obj TU, FLA_Obj TV );

// Same operands as FLA_Bidiag_UT plus a fla_bidiagut_t control pointer.
FLA_Error FLA_Bidiag_UT_internal( FLA_Obj A, FLA_Obj TU, FLA_Obj TV, fla_bidiagut_t* cntl );

// _l / _u flavours sharing the internal signature.
// NOTE(review): the suffixes presumably select the lower/upper bidiagonal
// case; verify against the callers.
FLA_Error FLA_Bidiag_UT_l( FLA_Obj A, FLA_Obj TU, FLA_Obj TV, fla_bidiagut_t* cntl );
FLA_Error FLA_Bidiag_UT_u( FLA_Obj A, FLA_Obj TU, FLA_Obj TV, fla_bidiagut_t* cntl );

// Takes A by value and TU/TV by pointer, so TU and TV can be written through
// (presumably created here based on A; confirm).
FLA_Error FLA_Bidiag_UT_create_T( FLA_Obj A, FLA_Obj* TU, FLA_Obj* TV );

// Takes TU, TV, tu and tv, all by value.
// NOTE(review): presumably fills tu/tv from TU/TV; confirm direction.
FLA_Error FLA_Bidiag_UT_recover_tau( FLA_Obj TU, FLA_Obj TV, FLA_Obj tu, FLA_Obj tv );

// Diagonal-extraction prototypes: each takes A, d and e as FLA_Obj values.
// A generic name is followed by _u and _l flavours with the same signature
// (presumably the generic routine dispatches to them; confirm).
FLA_Error FLA_Bidiag_UT_extract_diagonals( FLA_Obj A, FLA_Obj d, FLA_Obj e );
FLA_Error FLA_Bidiag_UT_u_extract_diagonals( FLA_Obj A, FLA_Obj d, FLA_Obj e );
FLA_Error FLA_Bidiag_UT_l_extract_diagonals( FLA_Obj A, FLA_Obj d, FLA_Obj e );

// "real" counterparts of the extraction prototypes above; identical
// parameter lists.
FLA_Error FLA_Bidiag_UT_extract_real_diagonals( FLA_Obj A, FLA_Obj d, FLA_Obj e );
FLA_Error FLA_Bidiag_UT_u_extract_real_diagonals( FLA_Obj A, FLA_Obj d, FLA_Obj e );
FLA_Error FLA_Bidiag_UT_l_extract_real_diagonals( FLA_Obj A, FLA_Obj d, FLA_Obj e );

// Diagonal-scaling prototypes: each takes a scalar object alpha and the
// matrix object A.
FLA_Error FLA_Bidiag_UT_scale_diagonals( FLA_Obj alpha, FLA_Obj A );
FLA_Error FLA_Bidiag_UT_u_scale_diagonals( FLA_Obj alpha, FLA_Obj A );
FLA_Error FLA_Bidiag_UT_l_scale_diagonals( FLA_Obj alpha, FLA_Obj A );

// Realify prototypes: the generic name plus _l/_u flavours, each available as
// _unb and _opt. All take A, d and e as FLA_Obj values.
// NOTE(review): "_unb"/"_opt" presumably denote unblocked and optimized
// implementations; confirm against the definitions.
FLA_Error FLA_Bidiag_UT_realify( FLA_Obj A, FLA_Obj d, FLA_Obj e );
FLA_Error FLA_Bidiag_UT_l_realify_unb( FLA_Obj A, FLA_Obj d, FLA_Obj e );
FLA_Error FLA_Bidiag_UT_l_realify_opt( FLA_Obj A, FLA_Obj d, FLA_Obj e );
FLA_Error FLA_Bidiag_UT_u_realify_unb( FLA_Obj A, FLA_Obj d, FLA_Obj e );
FLA_Error FLA_Bidiag_UT_u_realify_opt( FLA_Obj A, FLA_Obj d, FLA_Obj e );

// Realify prototypes taking four FLA_Obj operands a, b, d and e instead of a
// matrix. The generic form also takes an FLA_Uplo selector; the _opt form
// does not.
FLA_Error FLA_Bidiag_UT_realify_diagonals( FLA_Uplo uplo, FLA_Obj a, FLA_Obj b, FLA_Obj d, FLA_Obj e );
FLA_Error FLA_Bidiag_UT_realify_diagonals_opt( FLA_Obj a, FLA_Obj b, FLA_Obj d, FLA_Obj e );

// form_U / form_V prototypes: each takes A, a second FLA_Obj (T for U, S for
// V) and the FLA_Obj named U or V.
// NOTE(review): U/V are presumably the outputs; confirm.
FLA_Error FLA_Bidiag_UT_form_U( FLA_Obj A, FLA_Obj T, FLA_Obj U );
FLA_Error FLA_Bidiag_UT_form_V( FLA_Obj A, FLA_Obj S, FLA_Obj V );

// Extended forms: same operands plus an FLA_Uplo selector in front and an
// FLA_Trans flag (transu / transv) before the U or V operand.
FLA_Error FLA_Bidiag_UT_form_U_ext( FLA_Uplo uplo, FLA_Obj A, FLA_Obj T, FLA_Trans transu, FLA_Obj U );
FLA_Error FLA_Bidiag_UT_form_V_ext( FLA_Uplo uplo, FLA_Obj A, FLA_Obj S, FLA_Trans transv, FLA_Obj V );

// end FLA_Bidiag_UT.h

// Solves
// begin FLA_Lyap.h


// begin FLA_Lyap_n.h


// skipped #include "FLAME.h" 

// Lyap_n object-level variants 1-4 taking only isgn, A and C.
// NOTE(review): "_unb" presumably means unblocked; confirm.
FLA_Error FLA_Lyap_n_unb_var1( FLA_Obj isgn, FLA_Obj A, FLA_Obj C );
FLA_Error FLA_Lyap_n_unb_var2( FLA_Obj isgn, FLA_Obj A, FLA_Obj C );
FLA_Error FLA_Lyap_n_unb_var3( FLA_Obj isgn, FLA_Obj A, FLA_Obj C );
FLA_Error FLA_Lyap_n_unb_var4( FLA_Obj isgn, FLA_Obj A, FLA_Obj C );

// Lyap_n "_blk" variants 1-4: same operands plus a scale object and a
// fla_lyap_t control pointer.
// NOTE(review): "_blk" presumably means blocked; confirm.
FLA_Error FLA_Lyap_n_blk_var1( FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj scale, fla_lyap_t* cntl );
FLA_Error FLA_Lyap_n_blk_var2( FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj scale, fla_lyap_t* cntl );
FLA_Error FLA_Lyap_n_blk_var3( FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj scale, fla_lyap_t* cntl );
FLA_Error FLA_Lyap_n_blk_var4( FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj scale, fla_lyap_t* cntl );

// Lyap_n variant 1: an FLA_Obj-based prototype (_opt_) followed by raw-buffer
// prototypes for float (_ops_), double (_opd_), scomplex (_opc_) and
// dcomplex (_opz_) data.
// Each raw-buffer prototype takes the int m_AC, a pointer buff_sgn, and the
// buffers A, W and C, each with rs_*/cs_* ints (presumably row/column
// strides; confirm).
// NOTE(review): the _opt_ prototype has no W operand, so W is presumably a
// workspace provided by the caller of the typed routines; verify.
FLA_Error FLA_Lyap_n_opt_var1( FLA_Obj isgn, FLA_Obj A, FLA_Obj C );
FLA_Error FLA_Lyap_n_ops_var1( int m_AC,
                               float* buff_sgn,
                               float* buff_A, int rs_A, int cs_A, 
                               float* buff_W, int rs_W, int cs_W, 
                               float* buff_C, int rs_C, int cs_C );
FLA_Error FLA_Lyap_n_opd_var1( int m_AC,
                               double* buff_sgn,
                               double* buff_A, int rs_A, int cs_A, 
                               double* buff_W, int rs_W, int cs_W, 
                               double* buff_C, int rs_C, int cs_C );
FLA_Error FLA_Lyap_n_opc_var1( int m_AC,
                               scomplex* buff_sgn,
                               scomplex* buff_A, int rs_A, int cs_A, 
                               scomplex* buff_W, int rs_W, int cs_W, 
                               scomplex* buff_C, int rs_C, int cs_C );
FLA_Error FLA_Lyap_n_opz_var1( int m_AC,
                               dcomplex* buff_sgn,
                               dcomplex* buff_A, int rs_A, int cs_A, 
                               dcomplex* buff_W, int rs_W, int cs_W, 
                               dcomplex* buff_C, int rs_C, int cs_C );

// Lyap_n variant 2: an FLA_Obj-based prototype (_opt_) followed by raw-buffer
// prototypes for float, double, scomplex and dcomplex data.
// The typed prototypes take m_AC, buff_sgn, and buffers A, W and C with
// rs_*/cs_* ints (presumably row/column strides; confirm).
FLA_Error FLA_Lyap_n_opt_var2( FLA_Obj isgn, FLA_Obj A, FLA_Obj C );
FLA_Error FLA_Lyap_n_ops_var2( int m_AC,
                               float* buff_sgn,
                               float* buff_A, int rs_A, int cs_A, 
                               float* buff_W, int rs_W, int cs_W, 
                               float* buff_C, int rs_C, int cs_C );
FLA_Error FLA_Lyap_n_opd_var2( int m_AC,
                               double* buff_sgn,
                               double* buff_A, int rs_A, int cs_A, 
                               double* buff_W, int rs_W, int cs_W, 
                               double* buff_C, int rs_C, int cs_C );
FLA_Error FLA_Lyap_n_opc_var2( int m_AC,
                               scomplex* buff_sgn,
                               scomplex* buff_A, int rs_A, int cs_A, 
                               scomplex* buff_W, int rs_W, int cs_W, 
                               scomplex* buff_C, int rs_C, int cs_C );
FLA_Error FLA_Lyap_n_opz_var2( int m_AC,
                               dcomplex* buff_sgn,
                               dcomplex* buff_A, int rs_A, int cs_A, 
                               dcomplex* buff_W, int rs_W, int cs_W, 
                               dcomplex* buff_C, int rs_C, int cs_C );

// Lyap_n variant 3: an FLA_Obj-based prototype (_opt_) followed by raw-buffer
// prototypes for float, double, scomplex and dcomplex data.
// The typed prototypes take m_AC, buff_sgn, and buffers A, W and C with
// rs_*/cs_* ints (presumably row/column strides; confirm).
FLA_Error FLA_Lyap_n_opt_var3( FLA_Obj isgn, FLA_Obj A, FLA_Obj C );
FLA_Error FLA_Lyap_n_ops_var3( int m_AC,
                               float* buff_sgn,
                               float* buff_A, int rs_A, int cs_A, 
                               float* buff_W, int rs_W, int cs_W, 
                               float* buff_C, int rs_C, int cs_C );
FLA_Error FLA_Lyap_n_opd_var3( int m_AC,
                               double* buff_sgn,
                               double* buff_A, int rs_A, int cs_A, 
                               double* buff_W, int rs_W, int cs_W, 
                               double* buff_C, int rs_C, int cs_C );
FLA_Error FLA_Lyap_n_opc_var3( int m_AC,
                               scomplex* buff_sgn,
                               scomplex* buff_A, int rs_A, int cs_A, 
                               scomplex* buff_W, int rs_W, int cs_W, 
                               scomplex* buff_C, int rs_C, int cs_C );
FLA_Error FLA_Lyap_n_opz_var3( int m_AC,
                               dcomplex* buff_sgn,
                               dcomplex* buff_A, int rs_A, int cs_A, 
                               dcomplex* buff_W, int rs_W, int cs_W, 
                               dcomplex* buff_C, int rs_C, int cs_C );

// Lyap_n variant 4: an FLA_Obj-based prototype (_opt_) followed by raw-buffer
// prototypes for float, double, scomplex and dcomplex data.
// The typed prototypes take m_AC, buff_sgn, and buffers A, W and C with
// rs_*/cs_* ints (presumably row/column strides; confirm).
FLA_Error FLA_Lyap_n_opt_var4( FLA_Obj isgn, FLA_Obj A, FLA_Obj C );
FLA_Error FLA_Lyap_n_ops_var4( int m_AC,
                               float* buff_sgn,
                               float* buff_A, int rs_A, int cs_A, 
                               float* buff_W, int rs_W, int cs_W, 
                               float* buff_C, int rs_C, int cs_C );
FLA_Error FLA_Lyap_n_opd_var4( int m_AC,
                               double* buff_sgn,
                               double* buff_A, int rs_A, int cs_A, 
                               double* buff_W, int rs_W, int cs_W, 
                               double* buff_C, int rs_C, int cs_C );
FLA_Error FLA_Lyap_n_opc_var4( int m_AC,
                               scomplex* buff_sgn,
                               scomplex* buff_A, int rs_A, int cs_A, 
                               scomplex* buff_W, int rs_W, int cs_W, 
                               scomplex* buff_C, int rs_C, int cs_C );
FLA_Error FLA_Lyap_n_opz_var4( int m_AC,
                               dcomplex* buff_sgn,
                               dcomplex* buff_A, int rs_A, int cs_A, 
                               dcomplex* buff_W, int rs_W, int cs_W, 
                               dcomplex* buff_C, int rs_C, int cs_C );
// end FLA_Lyap_n.h
// begin FLA_Lyap_h.h


// skipped #include "FLAME.h" 

// Lyap_h object-level variants 1-4 taking only isgn, A and C.
// NOTE(review): "_h" presumably denotes the conjugate-transpose case and
// "_unb" unblocked; confirm against the dispatcher.
FLA_Error FLA_Lyap_h_unb_var1( FLA_Obj isgn, FLA_Obj A, FLA_Obj C );
FLA_Error FLA_Lyap_h_unb_var2( FLA_Obj isgn, FLA_Obj A, FLA_Obj C );
FLA_Error FLA_Lyap_h_unb_var3( FLA_Obj isgn, FLA_Obj A, FLA_Obj C );
FLA_Error FLA_Lyap_h_unb_var4( FLA_Obj isgn, FLA_Obj A, FLA_Obj C );

// Lyap_h "_blk" variants 1-4: same operands plus a scale object and a
// fla_lyap_t control pointer.
FLA_Error FLA_Lyap_h_blk_var1( FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj scale, fla_lyap_t* cntl );
FLA_Error FLA_Lyap_h_blk_var2( FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj scale, fla_lyap_t* cntl );
FLA_Error FLA_Lyap_h_blk_var3( FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj scale, fla_lyap_t* cntl );
FLA_Error FLA_Lyap_h_blk_var4( FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj scale, fla_lyap_t* cntl );

// Lyap_h variant 1: an FLA_Obj-based prototype (_opt_) followed by raw-buffer
// prototypes for float (_ops_), double (_opd_), scomplex (_opc_) and
// dcomplex (_opz_) data.
// Each raw-buffer prototype takes the int m_AC, a pointer buff_sgn, and the
// buffers A, W and C, each with rs_*/cs_* ints (presumably row/column
// strides; confirm).
// NOTE(review): the _opt_ prototype has no W operand, so W is presumably a
// workspace provided by the caller of the typed routines; verify.
FLA_Error FLA_Lyap_h_opt_var1( FLA_Obj isgn, FLA_Obj A, FLA_Obj C );
FLA_Error FLA_Lyap_h_ops_var1( int m_AC,
                               float* buff_sgn,
                               float* buff_A, int rs_A, int cs_A, 
                               float* buff_W, int rs_W, int cs_W, 
                               float* buff_C, int rs_C, int cs_C );
FLA_Error FLA_Lyap_h_opd_var1( int m_AC,
                               double* buff_sgn,
                               double* buff_A, int rs_A, int cs_A, 
                               double* buff_W, int rs_W, int cs_W, 
                               double* buff_C, int rs_C, int cs_C );
FLA_Error FLA_Lyap_h_opc_var1( int m_AC,
                               scomplex* buff_sgn,
                               scomplex* buff_A, int rs_A, int cs_A, 
                               scomplex* buff_W, int rs_W, int cs_W, 
                               scomplex* buff_C, int rs_C, int cs_C );
FLA_Error FLA_Lyap_h_opz_var1( int m_AC,
                               dcomplex* buff_sgn,
                               dcomplex* buff_A, int rs_A, int cs_A, 
                               dcomplex* buff_W, int rs_W, int cs_W, 
                               dcomplex* buff_C, int rs_C, int cs_C );

// Lyap_h variant 2: an FLA_Obj-based prototype (_opt_) followed by raw-buffer
// prototypes for float, double, scomplex and dcomplex data.
// The typed prototypes take m_AC, buff_sgn, and buffers A, W and C with
// rs_*/cs_* ints (presumably row/column strides; confirm).
FLA_Error FLA_Lyap_h_opt_var2( FLA_Obj isgn, FLA_Obj A, FLA_Obj C );
FLA_Error FLA_Lyap_h_ops_var2( int m_AC,
                               float* buff_sgn,
                               float* buff_A, int rs_A, int cs_A, 
                               float* buff_W, int rs_W, int cs_W, 
                               float* buff_C, int rs_C, int cs_C );
FLA_Error FLA_Lyap_h_opd_var2( int m_AC,
                               double* buff_sgn,
                               double* buff_A, int rs_A, int cs_A, 
                               double* buff_W, int rs_W, int cs_W, 
                               double* buff_C, int rs_C, int cs_C );
FLA_Error FLA_Lyap_h_opc_var2( int m_AC,
                               scomplex* buff_sgn,
                               scomplex* buff_A, int rs_A, int cs_A, 
                               scomplex* buff_W, int rs_W, int cs_W, 
                               scomplex* buff_C, int rs_C, int cs_C );
FLA_Error FLA_Lyap_h_opz_var2( int m_AC,
                               dcomplex* buff_sgn,
                               dcomplex* buff_A, int rs_A, int cs_A, 
                               dcomplex* buff_W, int rs_W, int cs_W, 
                               dcomplex* buff_C, int rs_C, int cs_C );

// Lyap_h variant 3: an FLA_Obj-based prototype (_opt_) followed by raw-buffer
// prototypes for float, double, scomplex and dcomplex data.
// The typed prototypes take m_AC, buff_sgn, and buffers A, W and C with
// rs_*/cs_* ints (presumably row/column strides; confirm).
FLA_Error FLA_Lyap_h_opt_var3( FLA_Obj isgn, FLA_Obj A, FLA_Obj C );
FLA_Error FLA_Lyap_h_ops_var3( int m_AC,
                               float* buff_sgn,
                               float* buff_A, int rs_A, int cs_A, 
                               float* buff_W, int rs_W, int cs_W, 
                               float* buff_C, int rs_C, int cs_C );
FLA_Error FLA_Lyap_h_opd_var3( int m_AC,
                               double* buff_sgn,
                               double* buff_A, int rs_A, int cs_A, 
                               double* buff_W, int rs_W, int cs_W, 
                               double* buff_C, int rs_C, int cs_C );
FLA_Error FLA_Lyap_h_opc_var3( int m_AC,
                               scomplex* buff_sgn,
                               scomplex* buff_A, int rs_A, int cs_A, 
                               scomplex* buff_W, int rs_W, int cs_W, 
                               scomplex* buff_C, int rs_C, int cs_C );
FLA_Error FLA_Lyap_h_opz_var3( int m_AC,
                               dcomplex* buff_sgn,
                               dcomplex* buff_A, int rs_A, int cs_A, 
                               dcomplex* buff_W, int rs_W, int cs_W, 
                               dcomplex* buff_C, int rs_C, int cs_C );

// Lyap_h variant 4: an FLA_Obj-based prototype (_opt_) followed by raw-buffer
// prototypes for float, double, scomplex and dcomplex data.
// The typed prototypes take m_AC, buff_sgn, and buffers A, W and C with
// rs_*/cs_* ints (presumably row/column strides; confirm).
FLA_Error FLA_Lyap_h_opt_var4( FLA_Obj isgn, FLA_Obj A, FLA_Obj C );
FLA_Error FLA_Lyap_h_ops_var4( int m_AC,
                               float* buff_sgn,
                               float* buff_A, int rs_A, int cs_A, 
                               float* buff_W, int rs_W, int cs_W, 
                               float* buff_C, int rs_C, int cs_C );
FLA_Error FLA_Lyap_h_opd_var4( int m_AC,
                               double* buff_sgn,
                               double* buff_A, int rs_A, int cs_A, 
                               double* buff_W, int rs_W, int cs_W, 
                               double* buff_C, int rs_C, int cs_C );
FLA_Error FLA_Lyap_h_opc_var4( int m_AC,
                               scomplex* buff_sgn,
                               scomplex* buff_A, int rs_A, int cs_A, 
                               scomplex* buff_W, int rs_W, int cs_W, 
                               scomplex* buff_C, int rs_C, int cs_C );
FLA_Error FLA_Lyap_h_opz_var4( int m_AC,
                               dcomplex* buff_sgn,
                               dcomplex* buff_A, int rs_A, int cs_A, 
                               dcomplex* buff_W, int rs_W, int cs_W, 
                               dcomplex* buff_C, int rs_C, int cs_C );
// end FLA_Lyap_h.h

// FLASH_-prefixed Lyap entry point: takes an FLA_Trans flag plus the FLA_Obj
// operands isgn, A, C and scale.
// NOTE(review): the FLASH_ prefix presumably marks the hierarchical-matrix
// API; confirm.
FLASH_Error_placeholder_guard_unused

// end FLA_Lyap.h
// begin FLA_Sylv.h


// begin FLA_Sylv_nn.h


// skipped #include "FLAME.h" 

// Sylv_nn "_blk" variants 1-18. Every prototype takes the FLA_Obj operands
// isgn, A, B, C and scale plus a fla_sylv_t control pointer, and returns an
// FLA_Error.
// NOTE(review): "_nn" presumably encodes no-transpose for both A and B, and
// "_blk" a blocked algorithm; confirm against the dispatcher.
FLA_Error FLA_Sylv_nn_blk_var1( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nn_blk_var2( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nn_blk_var3( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nn_blk_var4( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nn_blk_var5( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nn_blk_var6( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nn_blk_var7( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nn_blk_var8( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nn_blk_var9( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nn_blk_var10( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nn_blk_var11( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nn_blk_var12( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nn_blk_var13( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nn_blk_var14( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nn_blk_var15( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nn_blk_var16( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nn_blk_var17( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nn_blk_var18( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );

// Sylv_nn "_opt" variants 1-18. Same FLA_Obj operands as the _blk family
// (isgn, A, B, C, scale) but without a control pointer.
// NOTE(review): only a var1 set of typed (_ops/_opd/_opc/_opz) prototypes is
// declared after this list; check whether variants 2-18 have typed kernels.
FLA_Error FLA_Sylv_nn_opt_var1( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nn_opt_var2( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nn_opt_var3( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nn_opt_var4( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nn_opt_var5( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nn_opt_var6( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nn_opt_var7( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nn_opt_var8( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nn_opt_var9( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nn_opt_var10( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nn_opt_var11( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nn_opt_var12( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nn_opt_var13( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nn_opt_var14( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nn_opt_var15( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nn_opt_var16( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nn_opt_var17( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nn_opt_var18( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );

// Typed raw-buffer prototypes of Sylv_nn variant 1 for float (_ops_), double
// (_opd_), scomplex (_opc_) and dcomplex (_opz_) data.
//   sgn        - passed by value as a real scalar: float for the _ops_/_opc_
//                prototypes, double for _opd_/_opz_ (i.e. real even when the
//                matrices are complex).
//   m_C, n_C   - integer dimensions (presumably rows/columns of C; confirm).
//   buff_A/B/C - element-typed buffers with rs_*/cs_* ints (presumably
//                row/column strides; confirm).
//   buff_scale - pointer of the same element type as the matrices.
//   info       - int pointer (presumably an output status; confirm).
FLA_Error FLA_Sylv_nn_ops_var1( float sgn,
                                int m_C,
                                int n_C,
                                float* buff_A, int rs_A, int cs_A,
                                float* buff_B, int rs_B, int cs_B,
                                float* buff_C, int rs_C, int cs_C,
                                float* buff_scale,
                                int* info );
FLA_Error FLA_Sylv_nn_opd_var1( double sgn,
                                int m_C,
                                int n_C,
                                double* buff_A, int rs_A, int cs_A,
                                double* buff_B, int rs_B, int cs_B,
                                double* buff_C, int rs_C, int cs_C,
                                double* buff_scale,
                                int* info );
FLA_Error FLA_Sylv_nn_opc_var1( float sgn,
                                int m_C,
                                int n_C,
                                scomplex* buff_A, int rs_A, int cs_A,
                                scomplex* buff_B, int rs_B, int cs_B,
                                scomplex* buff_C, int rs_C, int cs_C,
                                scomplex* buff_scale,
                                int* info );
FLA_Error FLA_Sylv_nn_opz_var1( double sgn,
                                int m_C,
                                int n_C,
                                dcomplex* buff_A, int rs_A, int cs_A,
                                dcomplex* buff_B, int rs_B, int cs_B,
                                dcomplex* buff_C, int rs_C, int cs_C,
                                dcomplex* buff_scale,
                                int* info );
// end FLA_Sylv_nn.h
// begin FLA_Sylv_nh.h


// skipped #include "FLAME.h" 

// Sylv_nh "_blk" variants 1-18. Every prototype takes the FLA_Obj operands
// isgn, A, B, C and scale plus a fla_sylv_t control pointer, and returns an
// FLA_Error.
// NOTE(review): "_nh" presumably encodes no-transpose A and
// conjugate-transpose B; confirm against the dispatcher.
FLA_Error FLA_Sylv_nh_blk_var1( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nh_blk_var2( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nh_blk_var3( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nh_blk_var4( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nh_blk_var5( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nh_blk_var6( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nh_blk_var7( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nh_blk_var8( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nh_blk_var9( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nh_blk_var10( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nh_blk_var11( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nh_blk_var12( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nh_blk_var13( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nh_blk_var14( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nh_blk_var15( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nh_blk_var16( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nh_blk_var17( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nh_blk_var18( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );

// Sylv_nh "_opt" variants 1-18. Same FLA_Obj operands as the _blk family
// (isgn, A, B, C, scale) but without a control pointer.
// NOTE(review): only a var1 set of typed (_ops/_opd/_opc/_opz) prototypes is
// declared after this list; check whether variants 2-18 have typed kernels.
FLA_Error FLA_Sylv_nh_opt_var1( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nh_opt_var2( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nh_opt_var3( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nh_opt_var4( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nh_opt_var5( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nh_opt_var6( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nh_opt_var7( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nh_opt_var8( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nh_opt_var9( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nh_opt_var10( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nh_opt_var11( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nh_opt_var12( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nh_opt_var13( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nh_opt_var14( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nh_opt_var15( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nh_opt_var16( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nh_opt_var17( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_nh_opt_var18( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );

// Typed raw-buffer prototypes of Sylv_nh variant 1 for float (_ops_), double
// (_opd_), scomplex (_opc_) and dcomplex (_opz_) data.
//   sgn        - passed by value as a real scalar: float for the _ops_/_opc_
//                prototypes, double for _opd_/_opz_ (i.e. real even when the
//                matrices are complex).
//   m_C, n_C   - integer dimensions (presumably rows/columns of C; confirm).
//   buff_A/B/C - element-typed buffers with rs_*/cs_* ints (presumably
//                row/column strides; confirm).
//   buff_scale - pointer of the same element type as the matrices.
//   info       - int pointer (presumably an output status; confirm).
FLA_Error FLA_Sylv_nh_ops_var1( float sgn,
                                int m_C,
                                int n_C,
                                float* buff_A, int rs_A, int cs_A,
                                float* buff_B, int rs_B, int cs_B,
                                float* buff_C, int rs_C, int cs_C,
                                float* buff_scale,
                                int* info );
FLA_Error FLA_Sylv_nh_opd_var1( double sgn,
                                int m_C,
                                int n_C,
                                double* buff_A, int rs_A, int cs_A,
                                double* buff_B, int rs_B, int cs_B,
                                double* buff_C, int rs_C, int cs_C,
                                double* buff_scale,
                                int* info );
FLA_Error FLA_Sylv_nh_opc_var1( float sgn,
                                int m_C,
                                int n_C,
                                scomplex* buff_A, int rs_A, int cs_A,
                                scomplex* buff_B, int rs_B, int cs_B,
                                scomplex* buff_C, int rs_C, int cs_C,
                                scomplex* buff_scale,
                                int* info );
FLA_Error FLA_Sylv_nh_opz_var1( double sgn,
                                int m_C,
                                int n_C,
                                dcomplex* buff_A, int rs_A, int cs_A,
                                dcomplex* buff_B, int rs_B, int cs_B,
                                dcomplex* buff_C, int rs_C, int cs_C,
                                dcomplex* buff_scale,
                                int* info );
// end FLA_Sylv_nh.h
// begin FLA_Sylv_hn.h


// skipped #include "FLAME.h" 

// Sylv_hn "_blk" variants 1-18. Every prototype takes the FLA_Obj operands
// isgn, A, B, C and scale plus a fla_sylv_t control pointer, and returns an
// FLA_Error.
// NOTE(review): "_hn" presumably encodes conjugate-transpose A and
// no-transpose B; confirm against the dispatcher.
FLA_Error FLA_Sylv_hn_blk_var1( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hn_blk_var2( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hn_blk_var3( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hn_blk_var4( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hn_blk_var5( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hn_blk_var6( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hn_blk_var7( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hn_blk_var8( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hn_blk_var9( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hn_blk_var10( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hn_blk_var11( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hn_blk_var12( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hn_blk_var13( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hn_blk_var14( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hn_blk_var15( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hn_blk_var16( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hn_blk_var17( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hn_blk_var18( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );

// Sylv_hn "_opt" variants 1-18. Same FLA_Obj operands as the _blk family
// (isgn, A, B, C, scale) but without a control pointer.
// NOTE(review): only a var1 set of typed (_ops/_opd/_opc/_opz) prototypes is
// declared after this list; check whether variants 2-18 have typed kernels.
FLA_Error FLA_Sylv_hn_opt_var1( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hn_opt_var2( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hn_opt_var3( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hn_opt_var4( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hn_opt_var5( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hn_opt_var6( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hn_opt_var7( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hn_opt_var8( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hn_opt_var9( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hn_opt_var10( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hn_opt_var11( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hn_opt_var12( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hn_opt_var13( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hn_opt_var14( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hn_opt_var15( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hn_opt_var16( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hn_opt_var17( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hn_opt_var18( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );

// Typed raw-buffer prototypes of Sylv_hn variant 1 for float (_ops_), double
// (_opd_), scomplex (_opc_) and dcomplex (_opz_) data.
//   sgn        - passed by value as a real scalar: float for the _ops_/_opc_
//                prototypes, double for _opd_/_opz_ (i.e. real even when the
//                matrices are complex).
//   m_C, n_C   - integer dimensions (presumably rows/columns of C; confirm).
//   buff_A/B/C - element-typed buffers with rs_*/cs_* ints (presumably
//                row/column strides; confirm).
//   buff_scale - pointer of the same element type as the matrices.
//   info       - int pointer (presumably an output status; confirm).
FLA_Error FLA_Sylv_hn_ops_var1( float sgn,
                                int m_C,
                                int n_C,
                                float* buff_A, int rs_A, int cs_A,
                                float* buff_B, int rs_B, int cs_B,
                                float* buff_C, int rs_C, int cs_C,
                                float* buff_scale,
                                int* info );
FLA_Error FLA_Sylv_hn_opd_var1( double sgn,
                                int m_C,
                                int n_C,
                                double* buff_A, int rs_A, int cs_A,
                                double* buff_B, int rs_B, int cs_B,
                                double* buff_C, int rs_C, int cs_C,
                                double* buff_scale,
                                int* info );
FLA_Error FLA_Sylv_hn_opc_var1( float sgn,
                                int m_C,
                                int n_C,
                                scomplex* buff_A, int rs_A, int cs_A,
                                scomplex* buff_B, int rs_B, int cs_B,
                                scomplex* buff_C, int rs_C, int cs_C,
                                scomplex* buff_scale,
                                int* info );
FLA_Error FLA_Sylv_hn_opz_var1( double sgn,
                                int m_C,
                                int n_C,
                                dcomplex* buff_A, int rs_A, int cs_A,
                                dcomplex* buff_B, int rs_B, int cs_B,
                                dcomplex* buff_C, int rs_C, int cs_C,
                                dcomplex* buff_scale,
                                int* info );
// end FLA_Sylv_hn.h
// begin FLA_Sylv_hh.h


// skipped #include "FLAME.h" 

// Prototypes for the 18 "blk" variants of FLA_Sylv_hh. All share a single
// signature: the sign object isgn, operand objects A, B and C, the scale
// object, and a fla_sylv_t control pointer (cntl).
// NOTE(review): "blk" presumably denotes blocked algorithms steered by the
// cntl control structure -- confirm against the definitions.
FLA_Error FLA_Sylv_hh_blk_var1( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hh_blk_var2( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hh_blk_var3( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hh_blk_var4( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hh_blk_var5( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hh_blk_var6( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hh_blk_var7( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hh_blk_var8( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hh_blk_var9( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hh_blk_var10( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hh_blk_var11( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hh_blk_var12( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hh_blk_var13( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hh_blk_var14( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hh_blk_var15( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hh_blk_var16( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hh_blk_var17( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hh_blk_var18( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );

// Prototypes for the 18 "opt" variants of FLA_Sylv_hh. They take the same
// five objects as the blk variants (isgn, A, B, C, scale) but no control
// pointer.
// NOTE(review): "opt" presumably denotes the optimized unblocked
// implementations -- confirm against the definitions.
FLA_Error FLA_Sylv_hh_opt_var1( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hh_opt_var2( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hh_opt_var3( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hh_opt_var4( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hh_opt_var5( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hh_opt_var6( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hh_opt_var7( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hh_opt_var8( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hh_opt_var9( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hh_opt_var10( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hh_opt_var11( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hh_opt_var12( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hh_opt_var13( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hh_opt_var14( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hh_opt_var15( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hh_opt_var16( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hh_opt_var17( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLA_Sylv_hh_opt_var18( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );

// Typed-buffer prototypes for FLA_Sylv_hh variant 1. The letter after "op"
// selects the element type of the A/B/C buffers: s = float, d = double,
// c = scomplex, z = dcomplex.
// Note that sgn stays real (float for s/c, double for d/z) even in the
// complex variants, whereas buff_scale uses the same element type as the
// matrix buffers.
// NOTE(review): m_C/n_C are presumably the dimensions of C, rs_X/cs_X the
// row/column strides of X, and info an output status -- confirm against the
// implementations.
FLA_Error FLA_Sylv_hh_ops_var1( float sgn,
                                int m_C,
                                int n_C,
                                float* buff_A, int rs_A, int cs_A,
                                float* buff_B, int rs_B, int cs_B,
                                float* buff_C, int rs_C, int cs_C,
                                float* buff_scale,
                                int* info );
FLA_Error FLA_Sylv_hh_opd_var1( double sgn,
                                int m_C,
                                int n_C,
                                double* buff_A, int rs_A, int cs_A,
                                double* buff_B, int rs_B, int cs_B,
                                double* buff_C, int rs_C, int cs_C,
                                double* buff_scale,
                                int* info );
FLA_Error FLA_Sylv_hh_opc_var1( float sgn,
                                int m_C,
                                int n_C,
                                scomplex* buff_A, int rs_A, int cs_A,
                                scomplex* buff_B, int rs_B, int cs_B,
                                scomplex* buff_C, int rs_C, int cs_C,
                                scomplex* buff_scale,
                                int* info );
FLA_Error FLA_Sylv_hh_opz_var1( double sgn,
                                int m_C,
                                int n_C,
                                dcomplex* buff_A, int rs_A, int cs_A,
                                dcomplex* buff_B, int rs_B, int cs_B,
                                dcomplex* buff_C, int rs_C, int cs_C,
                                dcomplex* buff_scale,
                                int* info );
// end FLA_Sylv_hh.h

// Control-structure-driven Sylvester entry points. FLA_Sylv_internal takes
// two FLA_Trans arguments (transa, transb) in addition to the parameters
// shared by FLA_Sylv_nn/_nh/_hn/_hh.
// NOTE(review): presumably FLA_Sylv_internal selects one of the nn/nh/hn/hh
// routines based on transa/transb ("n" = no transpose, "h" = conjugate
// transpose); the dispatch itself is not visible in this header -- confirm.
FLA_Error FLA_Sylv_internal( FLA_Trans transa, FLA_Trans transb, FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nn( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_nh( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hn( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );
FLA_Error FLA_Sylv_hh( FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl );

// end FLA_Sylv.h

// Miscellaneous
// begin FLA_Ttmm.h


// begin FLA_Ttmm_l.h


// skipped #include "FLAME.h" 

// "blk" variants 1-3 of FLA_Ttmm_l: each takes the matrix object A and a
// fla_ttmm_t control pointer.
// NOTE(review): presumably blocked algorithms for the lower-triangular case
// -- confirm against the definitions.
FLA_Error FLA_Ttmm_l_blk_var1( FLA_Obj A, fla_ttmm_t* cntl );
FLA_Error FLA_Ttmm_l_blk_var2( FLA_Obj A, fla_ttmm_t* cntl );
FLA_Error FLA_Ttmm_l_blk_var3( FLA_Obj A, fla_ttmm_t* cntl );

// "unb" variants 1-3 of FLA_Ttmm_l: object interface taking only A, with no
// control pointer.
// NOTE(review): "unb" presumably denotes the unblocked reference
// implementations -- confirm.
FLA_Error FLA_Ttmm_l_unb_var1( FLA_Obj A );
FLA_Error FLA_Ttmm_l_unb_var2( FLA_Obj A );
FLA_Error FLA_Ttmm_l_unb_var3( FLA_Obj A );

// FLA_Ttmm_l variant 1: object-interface "opt" entry followed by its
// typed-buffer counterparts (s = float, d = double, c = scomplex,
// z = dcomplex). The typed routines take a single dimension mn_A plus the
// buffer A with its two stride arguments.
// NOTE(review): mn_A presumably is the order of the square matrix A and
// rs_A/cs_A its row/column strides -- confirm.
FLA_Error FLA_Ttmm_l_opt_var1( FLA_Obj A );
FLA_Error FLA_Ttmm_l_ops_var1( int mn_A,
                               float*    A, int rs_A, int cs_A );
FLA_Error FLA_Ttmm_l_opd_var1( int mn_A,
                               double*   A, int rs_A, int cs_A );
FLA_Error FLA_Ttmm_l_opc_var1( int mn_A,
                               scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_Ttmm_l_opz_var1( int mn_A,
                               dcomplex* A, int rs_A, int cs_A );

// FLA_Ttmm_l variant 2: object-interface "opt" entry followed by its
// typed-buffer counterparts (s = float, d = double, c = scomplex,
// z = dcomplex). Signatures are identical to those of variant 1.
FLA_Error FLA_Ttmm_l_opt_var2( FLA_Obj A );
FLA_Error FLA_Ttmm_l_ops_var2( int mn_A,
                               float*    A, int rs_A, int cs_A );
FLA_Error FLA_Ttmm_l_opd_var2( int mn_A,
                               double*   A, int rs_A, int cs_A );
FLA_Error FLA_Ttmm_l_opc_var2( int mn_A,
                               scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_Ttmm_l_opz_var2( int mn_A,
                               dcomplex* A, int rs_A, int cs_A );

// FLA_Ttmm_l variant 3: object-interface "opt" entry followed by its
// typed-buffer counterparts (s = float, d = double, c = scomplex,
// z = dcomplex). Signatures are identical to those of variants 1 and 2.
FLA_Error FLA_Ttmm_l_opt_var3( FLA_Obj A );
FLA_Error FLA_Ttmm_l_ops_var3( int mn_A,
                               float*    A, int rs_A, int cs_A );
FLA_Error FLA_Ttmm_l_opd_var3( int mn_A,
                               double*   A, int rs_A, int cs_A );
FLA_Error FLA_Ttmm_l_opc_var3( int mn_A,
                               scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_Ttmm_l_opz_var3( int mn_A,
                               dcomplex* A, int rs_A, int cs_A );

// end FLA_Ttmm_l.h
// begin FLA_Ttmm_u.h


// skipped #include "FLAME.h" 

// "blk" variants 1-3 of FLA_Ttmm_u: each takes the matrix object A and a
// fla_ttmm_t control pointer. Signatures mirror the FLA_Ttmm_l_blk_* set.
// NOTE(review): presumably blocked algorithms for the upper-triangular case
// -- confirm against the definitions.
FLA_Error FLA_Ttmm_u_blk_var1( FLA_Obj A, fla_ttmm_t* cntl );
FLA_Error FLA_Ttmm_u_blk_var2( FLA_Obj A, fla_ttmm_t* cntl );
FLA_Error FLA_Ttmm_u_blk_var3( FLA_Obj A, fla_ttmm_t* cntl );

// "unb" variants 1-3 of FLA_Ttmm_u: object interface taking only A, with no
// control pointer.
// NOTE(review): "unb" presumably denotes the unblocked reference
// implementations -- confirm.
FLA_Error FLA_Ttmm_u_unb_var1( FLA_Obj A );
FLA_Error FLA_Ttmm_u_unb_var2( FLA_Obj A );
FLA_Error FLA_Ttmm_u_unb_var3( FLA_Obj A );

// FLA_Ttmm_u variant 1: object-interface "opt" entry followed by its
// typed-buffer counterparts (s = float, d = double, c = scomplex,
// z = dcomplex). The typed routines take a single dimension mn_A plus the
// buffer A with its two stride arguments.
// NOTE(review): mn_A presumably is the order of the square matrix A and
// rs_A/cs_A its row/column strides -- confirm.
FLA_Error FLA_Ttmm_u_opt_var1( FLA_Obj A );
FLA_Error FLA_Ttmm_u_ops_var1( int mn_A,
                               float*    A, int rs_A, int cs_A );
FLA_Error FLA_Ttmm_u_opd_var1( int mn_A,
                               double*   A, int rs_A, int cs_A );
FLA_Error FLA_Ttmm_u_opc_var1( int mn_A,
                               scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_Ttmm_u_opz_var1( int mn_A,
                               dcomplex* A, int rs_A, int cs_A );

// FLA_Ttmm_u variant 2: object-interface "opt" entry followed by its
// typed-buffer counterparts (s = float, d = double, c = scomplex,
// z = dcomplex). Signatures are identical to those of variant 1.
FLA_Error FLA_Ttmm_u_opt_var2( FLA_Obj A );
FLA_Error FLA_Ttmm_u_ops_var2( int mn_A,
                               float*    A, int rs_A, int cs_A );
FLA_Error FLA_Ttmm_u_opd_var2( int mn_A,
                               double*   A, int rs_A, int cs_A );
FLA_Error FLA_Ttmm_u_opc_var2( int mn_A,
                               scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_Ttmm_u_opz_var2( int mn_A,
                               dcomplex* A, int rs_A, int cs_A );

// FLA_Ttmm_u variant 3: object-interface "opt" entry followed by its
// typed-buffer counterparts (s = float, d = double, c = scomplex,
// z = dcomplex). Signatures are identical to those of variants 1 and 2.
FLA_Error FLA_Ttmm_u_opt_var3( FLA_Obj A );
FLA_Error FLA_Ttmm_u_ops_var3( int mn_A,
                               float*    A, int rs_A, int cs_A );
FLA_Error FLA_Ttmm_u_opd_var3( int mn_A,
                               double*   A, int rs_A, int cs_A );
FLA_Error FLA_Ttmm_u_opc_var3( int mn_A,
                               scomplex* A, int rs_A, int cs_A );
FLA_Error FLA_Ttmm_u_opz_var3( int mn_A,
                               dcomplex* A, int rs_A, int cs_A );

// end FLA_Ttmm_u.h

// Control-structure-driven Ttmm entry points. FLA_Ttmm_internal takes an
// FLA_Uplo selector in addition to A and cntl; FLA_Ttmm_l and FLA_Ttmm_u
// take only A and cntl.
// NOTE(review): presumably FLA_Ttmm_internal forwards to FLA_Ttmm_l or
// FLA_Ttmm_u depending on uplo; the dispatch is not visible here -- confirm.
FLA_Error FLA_Ttmm_internal( FLA_Uplo uplo, FLA_Obj A, fla_ttmm_t* cntl );
FLA_Error FLA_Ttmm_l( FLA_Obj A, fla_ttmm_t* cntl );
FLA_Error FLA_Ttmm_u( FLA_Obj A, fla_ttmm_t* cntl );
// end FLA_Ttmm.h
// begin FLA_UDdate_UT.h


// begin FLA_UDdate_UT_vars.h


// "blk" variants 1 and 2 of FLA_UDdate_UT: each takes the objects R, C, D
// and T plus a fla_uddateut_t control pointer.
// NOTE(review): presumably blocked up-and-downdating algorithms in which T
// holds the accumulated UT-transform factors -- confirm against the
// definitions.
FLA_Error FLA_UDdate_UT_blk_var1( FLA_Obj R,
                                  FLA_Obj C,
                                  FLA_Obj D, FLA_Obj T, fla_uddateut_t* cntl );
FLA_Error FLA_UDdate_UT_blk_var2( FLA_Obj R,
                                  FLA_Obj C,
                                  FLA_Obj D, FLA_Obj T, fla_uddateut_t* cntl );

// "unb" variant 1 of FLA_UDdate_UT: same R, C, D, T objects as the blk
// variants, without a control pointer.
// NOTE(review): "unb" presumably denotes the unblocked reference
// implementation -- confirm.
FLA_Error FLA_UDdate_UT_unb_var1( FLA_Obj R,
                                  FLA_Obj C,
                                  FLA_Obj D, FLA_Obj T );

// FLA_UDdate_UT variant 1: object-interface "opt" entry followed by its
// typed-buffer counterparts (s = float, d = double, c = scomplex,
// z = dcomplex). The typed routines take three dimensions (mn_RT, m_C, m_D)
// and, for each of R, C, D and T, a buffer with two stride arguments.
// NOTE(review): mn_RT presumably is the shared order of R and T, and
// m_C/m_D the row counts of C and D -- confirm against the implementations.
FLA_Error FLA_UDdate_UT_opt_var1( FLA_Obj R,
                                  FLA_Obj C,
                                  FLA_Obj D, FLA_Obj T );
FLA_Error FLA_UDdate_UT_ops_var1( int mn_RT,
                                  int m_C,
                                  int m_D,
                                  float* R, int rs_R, int cs_R,
                                  float* C, int rs_C, int cs_C,
                                  float* D, int rs_D, int cs_D,
                                  float* T, int rs_T, int cs_T );
FLA_Error FLA_UDdate_UT_opd_var1( int mn_RT,
                                  int m_C,
                                  int m_D,
                                  double* R, int rs_R, int cs_R,
                                  double* C, int rs_C, int cs_C,
                                  double* D, int rs_D, int cs_D,
                                  double* T, int rs_T, int cs_T );
FLA_Error FLA_UDdate_UT_opc_var1( int mn_RT,
                                  int m_C,
                                  int m_D,
                                  scomplex* R, int rs_R, int cs_R,
                                  scomplex* C, int rs_C, int cs_C,
                                  scomplex* D, int rs_D, int cs_D,
                                  scomplex* T, int rs_T, int cs_T );
FLA_Error FLA_UDdate_UT_opz_var1( int mn_RT,
                                  int m_C,
                                  int m_D,
                                  dcomplex* R, int rs_R, int cs_R,
                                  dcomplex* C, int rs_C, int cs_C,
                                  dcomplex* D, int rs_D, int cs_D,
                                  dcomplex* T, int rs_T, int cs_T );

// end FLA_UDdate_UT_vars.h

// Front-end FLA_UDdate_UT routine: takes the objects R, C, D and T and no
// control pointer.
// NOTE(review): presumably the user-level entry point that forwards to
// FLA_UDdate_UT_internal with a default control structure -- confirm.
FLA_Error FLA_UDdate_UT( FLA_Obj R,
                         FLA_Obj C,
                         FLA_Obj D, FLA_Obj T );

// Control-structure-driven form of FLA_UDdate_UT: same R, C, D, T objects
// plus a fla_uddateut_t control pointer.
// NOTE(review): presumably selects among the blk/unb/opt variants based on
// cntl; the selection logic is not visible here -- confirm.
FLA_Error FLA_UDdate_UT_internal( FLA_Obj R,
                                  FLA_Obj C,
                                  FLA_Obj D, FLA_Obj T, fla_uddateut_t* cntl );

// Takes R by value and T by pointer, so T is an output object.
// NOTE(review): presumably creates a T object sized to match R for use with
// FLA_UDdate_UT; ownership/freeing of T is not visible here -- confirm.
FLA_Error FLA_UDdate_UT_create_T( FLA_Obj R, FLA_Obj* T );

// Takes T together with three object pairs: (bR), (C, bC) and (D, bD).
// NOTE(review): presumably applies a previously computed up/downdate
// (C, D, T) to the right-hand-side pieces bR, bC and bD -- confirm.
FLA_Error FLA_UDdate_UT_update_rhs( FLA_Obj T, FLA_Obj bR,
                                    FLA_Obj C, FLA_Obj bC,
                                    FLA_Obj D, FLA_Obj bD );

// Takes the objects R, bR and x.
// NOTE(review): presumably solves the triangular system involving R and the
// right-hand side bR, writing the solution to x -- confirm.
FLA_Error FLA_UDdate_UT_solve( FLA_Obj R, FLA_Obj bR, FLA_Obj x );
// end FLA_UDdate_UT.h
// begin FLA_UDdate_UT_inc.h


// FLASH-level front end of the "inc" up/downdate. Compared with
// FLA_UDdate_UT it takes one additional object, W.
// NOTE(review): presumably operates on hierarchical (FLASH) matrices using
// an incremental algorithm, with W as workspace -- confirm.
FLA_Error FLASH_UDdate_UT_inc( FLA_Obj R,
                               FLA_Obj C,
                               FLA_Obj D, FLA_Obj T, FLA_Obj W );

// "blk" variant 1 of FLA_UDdate_UT_inc: takes R, C, D, T and W plus a
// fla_uddateutinc_t control pointer.
// NOTE(review): presumably the blocked algorithm underlying
// FLASH_UDdate_UT_inc -- confirm.
FLA_Error FLA_UDdate_UT_inc_blk_var1( FLA_Obj R,
                                      FLA_Obj C,
                                      FLA_Obj D, FLA_Obj T, FLA_Obj W, fla_uddateutinc_t* cntl );

// Setup helpers for FLASH_UDdate_UT_inc.
// create_hier_matrices takes three flat input objects (R_flat, C_flat,
// D_flat), a depth, a b_flash array pointer and an algorithmic block size
// b_alg, and returns five objects through pointers (R, C, D, T, W).
// determine_alg_blocksize takes R and returns a dim_t.
// NOTE(review): presumably b_flash holds one storage block size per
// hierarchy level (depth entries) -- confirm against the implementation.
FLA_Error FLASH_UDdate_UT_inc_create_hier_matrices( FLA_Obj R_flat, FLA_Obj C_flat, FLA_Obj D_flat, dim_t depth, dim_t* b_flash, dim_t b_alg, FLA_Obj* R, FLA_Obj* C, FLA_Obj* D, FLA_Obj* T, FLA_Obj* W );
dim_t     FLASH_UDdate_UT_inc_determine_alg_blocksize( FLA_Obj R );

// FLASH-level counterparts of FLA_UDdate_UT_update_rhs and
// FLA_UDdate_UT_solve; the parameter lists are identical to those routines.
// NOTE(review): presumably update_rhs applies the computed up/downdate to
// bR/bC/bD and solve computes x from R and bR -- confirm.
FLA_Error FLASH_UDdate_UT_inc_update_rhs( FLA_Obj T, FLA_Obj bR,
                                          FLA_Obj C, FLA_Obj bC,
                                          FLA_Obj D, FLA_Obj bD );
FLA_Error FLASH_UDdate_UT_inc_solve( FLA_Obj R, FLA_Obj bR, FLA_Obj x );
// end FLA_UDdate_UT_inc.h

// Utility
// begin FLA_Accum_T_UT.h


// begin FLA_Accum_T_UT_fc.h


// skipped #include "FLAME.h" 

// Object-interface FLA_Accum_T_UT_fc routines ("unb" variant 1 and "blk"
// variant 2); both take the objects A, t and T and no control pointer.
// NOTE(review): "fc" presumably means forward direction / column-wise
// storage, matching the direct/storev arguments of
// FLA_Accum_T_UT_internal -- confirm.
FLA_Error FLA_Accum_T_UT_fc_unb_var1( FLA_Obj A, FLA_Obj t, FLA_Obj T );
FLA_Error FLA_Accum_T_UT_fc_blk_var2( FLA_Obj A, FLA_Obj t, FLA_Obj T );

// FLA_Accum_T_UT_fc variant 1: object-interface "opt" entry followed by its
// typed-buffer counterparts (s = float, d = double, c = scomplex,
// z = dcomplex). The typed routines take dimensions m_A and n_AT, matrix
// buffers A and T with two strides each, and a vector t described by a
// length m_t and a single increment inc_t.
// NOTE(review): n_AT presumably is the column count shared by A and T --
// confirm against the implementations.

FLA_Error FLA_Accum_T_UT_fc_opt_var1( FLA_Obj A, FLA_Obj t, FLA_Obj T );

FLA_Error FLA_Accum_T_UT_fc_ops_var1( int m_A,
                                      int n_AT,
                                      float* A, int rs_A, int cs_A,
                                      int m_t, 
                                      float* t, int inc_t,
                                      float* T, int rs_T, int cs_T );
FLA_Error FLA_Accum_T_UT_fc_opd_var1( int m_A,
                                      int n_AT,
                                      double* A, int rs_A, int cs_A,
                                      int m_t, 
                                      double* t, int inc_t,
                                      double* T, int rs_T, int cs_T );
FLA_Error FLA_Accum_T_UT_fc_opc_var1( int m_A,
                                      int n_AT,
                                      scomplex* A, int rs_A, int cs_A,
                                      int m_t, 
                                      scomplex* t, int inc_t,
                                      scomplex* T, int rs_T, int cs_T );
FLA_Error FLA_Accum_T_UT_fc_opz_var1( int m_A,
                                      int n_AT,
                                      dcomplex* A, int rs_A, int cs_A,
                                      int m_t, 
                                      dcomplex* t, int inc_t,
                                      dcomplex* T, int rs_T, int cs_T );
// end FLA_Accum_T_UT_fc.h
// begin FLA_Accum_T_UT_fr.h


// skipped #include "FLAME.h" 

// Object-interface FLA_Accum_T_UT_fr routines ("unb" variant 1 and "blk"
// variant 2); both take the objects A, t and T and no control pointer.
// NOTE(review): "fr" presumably means forward direction / row-wise storage,
// matching the direct/storev arguments of FLA_Accum_T_UT_internal --
// confirm.
FLA_Error FLA_Accum_T_UT_fr_unb_var1( FLA_Obj A, FLA_Obj t, FLA_Obj T );
FLA_Error FLA_Accum_T_UT_fr_blk_var2( FLA_Obj A, FLA_Obj t, FLA_Obj T );

// FLA_Accum_T_UT_fr variant 1: object-interface "opt" entry followed by its
// typed-buffer counterparts (s = float, d = double, c = scomplex,
// z = dcomplex). The parameter layout matches the fc routines except that
// the second dimension is named n_A here rather than n_AT.
// NOTE(review): m_t/inc_t presumably are the length and stride of the
// vector t -- confirm against the implementations.

FLA_Error FLA_Accum_T_UT_fr_opt_var1( FLA_Obj A, FLA_Obj t, FLA_Obj T );

FLA_Error FLA_Accum_T_UT_fr_ops_var1( int m_A,
                                      int n_A,
                                      float* A, int rs_A, int cs_A,
                                      int m_t,
                                      float* t, int inc_t,
                                      float* T, int rs_T, int cs_T );
FLA_Error FLA_Accum_T_UT_fr_opd_var1( int m_A,
                                      int n_A,
                                      double* A, int rs_A, int cs_A,
                                      int m_t,
                                      double* t, int inc_t,
                                      double* T, int rs_T, int cs_T );
FLA_Error FLA_Accum_T_UT_fr_opc_var1( int m_A,
                                      int n_A,
                                      scomplex* A, int rs_A, int cs_A,
                                      int m_t,
                                      scomplex* t, int inc_t,
                                      scomplex* T, int rs_T, int cs_T );
FLA_Error FLA_Accum_T_UT_fr_opz_var1( int m_A,
                                      int n_A,
                                      dcomplex* A, int rs_A, int cs_A,
                                      int m_t,
                                      dcomplex* t, int inc_t,
                                      dcomplex* T, int rs_T, int cs_T );
// end FLA_Accum_T_UT_fr.h

// Takes an FLA_Direct and an FLA_Store selector plus the objects A, tau
// and T. Note the vector parameter is named tau here but t in the fc/fr
// prototypes.
// NOTE(review): presumably dispatches to the fc or fr routines based on
// direct/storev; the dispatch is not visible here -- confirm.
FLA_Error FLA_Accum_T_UT_internal( FLA_Direct direct, FLA_Store storev, FLA_Obj A, FLA_Obj tau, FLA_Obj T );
// end FLA_Accum_T_UT.h
// begin FLA_Apply_G.h


// begin FLA_Apply_G_lf.h


// FLA_Apply_G_lf object-interface routines: "opt" variant 1 takes G and A;
// "blk" variant 3 additionally takes an algorithmic block size b_alg.
// NOTE(review): presumably applies the rotations stored in G to A from the
// left in forward order ("lf") -- confirm against the definitions.
FLA_Error FLA_Apply_G_lf_opt_var1( FLA_Obj G, FLA_Obj A );
FLA_Error FLA_Apply_G_lf_blk_var3( FLA_Obj G, FLA_Obj A, dim_t b_alg );
// end FLA_Apply_G_lf.h
// begin FLA_Apply_G_lb.h


// FLA_Apply_G_lb variant 1: object-interface "opt" entry (taking c, s and A)
// followed by its typed-buffer counterparts. Unlike the rf routines, the
// rotations are passed as two separate vectors buff_c and buff_s, each with
// its own increment. These vectors are always real: float for the s/c
// variants and double for the d/z variants, even when buff_A is complex.
// NOTE(review): presumably buff_c/buff_s hold the cosines/sines of Givens
// rotations applied to A from the left in backward order ("lb") -- confirm.
FLA_Error FLA_Apply_G_lb_opt_var1( FLA_Obj c, FLA_Obj s, FLA_Obj A );
FLA_Error FLA_Apply_G_lb_ops_var1( int       m_A,
                                   int       n_A,
                                   float*    buff_c, int inc_c,
                                   float*    buff_s, int inc_s,
                                   float*    buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_lb_opd_var1( int       m_A,
                                   int       n_A,
                                   double*   buff_c, int inc_c,
                                   double*   buff_s, int inc_s,
                                   double*   buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_lb_opc_var1( int       m_A,
                                   int       n_A,
                                   float*    buff_c, int inc_c,
                                   float*    buff_s, int inc_s,
                                   scomplex* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_lb_opz_var1( int       m_A,
                                   int       n_A,
                                   double*   buff_c, int inc_c,
                                   double*   buff_s, int inc_s,
                                   dcomplex* buff_A, int rs_A, int cs_A );

// end FLA_Apply_G_lb.h
// begin FLA_Apply_G_rf.h


// Variant 1

// FLA_Apply_G_rf variant 1, "opt" family: object-interface entry followed
// by typed-buffer counterparts. buff_G is always a complex buffer: scomplex
// for the single-precision s/c variants and dcomplex for the
// double-precision d/z variants, even when buff_A is real.
// NOTE(review): presumably each complex element of G packs one rotation's
// (cosine, sine) pair and k_G is the number of rotation sets -- confirm
// against the implementations.
FLA_Error FLA_Apply_G_rf_opt_var1( FLA_Obj G, FLA_Obj A );
FLA_Error FLA_Apply_G_rf_ops_var1( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_opd_var1( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_opc_var1( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_opz_var1( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A );

// FLA_Apply_G_rf variant 1, "as" family: object-interface entry (asm)
// followed by typed-buffer counterparts (ass/asd/asc/asz). Parameter lists
// are identical to the corresponding op* routines of this variant.
// NOTE(review): "as" presumably marks assembly/vector-intrinsic
// implementations -- confirm against the definitions.
FLA_Error FLA_Apply_G_rf_asm_var1( FLA_Obj G, FLA_Obj A );
FLA_Error FLA_Apply_G_rf_ass_var1( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asd_var1( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asc_var1( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asz_var1( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A );

// FLA_Apply_G_rf variant 1, "bl" family: object-interface entry (blk)
// followed by typed-buffer counterparts (bls/bld/blc/blz). These extend the
// op* parameter lists with a trailing block size b_alg. Note the type
// difference: the object interface takes b_alg as dim_t, the typed routines
// take it as int.
// NOTE(review): presumably blocked algorithms that process A in panels of
// b_alg -- confirm against the definitions.
FLA_Error FLA_Apply_G_rf_blk_var1( FLA_Obj G, FLA_Obj A, dim_t b_alg );
FLA_Error FLA_Apply_G_rf_bls_var1( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_bld_var1( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_blc_var1( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_blz_var1( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A,
                                   int       b_alg );

// Variant 2

// FLA_Apply_G_rf variant 2, "opt" family: object-interface entry followed
// by typed-buffer counterparts. buff_G is always a complex buffer: scomplex
// for the s/c variants and dcomplex for the d/z variants, even when buff_A
// is real.
// NOTE(review): presumably each complex element of G packs one rotation's
// (cosine, sine) pair -- confirm against the implementations.
FLA_Error FLA_Apply_G_rf_opt_var2( FLA_Obj G, FLA_Obj A );
FLA_Error FLA_Apply_G_rf_ops_var2( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_opd_var2( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_opc_var2( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_opz_var2( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A );

// FLA_Apply_G_rf variant 2, "as" family: object-interface entry (asm)
// followed by typed-buffer counterparts (ass/asd/asc/asz). Parameter lists
// are identical to the corresponding op* routines of this variant.
// NOTE(review): "as" presumably marks assembly/vector-intrinsic
// implementations -- confirm against the definitions.
FLA_Error FLA_Apply_G_rf_asm_var2( FLA_Obj G, FLA_Obj A );
FLA_Error FLA_Apply_G_rf_ass_var2( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asd_var2( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asc_var2( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asz_var2( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A );

// FLA_Apply_G_rf variant 2, "bl" family: object-interface entry (blk)
// followed by typed-buffer counterparts (bls/bld/blc/blz). These extend the
// op* parameter lists with a trailing block size b_alg, which is dim_t in
// the object interface and int in the typed routines.
// NOTE(review): presumably blocked algorithms that process A in panels of
// b_alg -- confirm against the definitions.
FLA_Error FLA_Apply_G_rf_blk_var2( FLA_Obj G, FLA_Obj A, dim_t b_alg );
FLA_Error FLA_Apply_G_rf_bls_var2( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_bld_var2( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_blc_var2( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_blz_var2( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A,
                                   int       b_alg );

// Variant 3

// FLA_Apply_G_rf variant 3, "opt" family: object-interface entry followed
// by typed-buffer counterparts. buff_G is always a complex buffer: scomplex
// for the s/c variants and dcomplex for the d/z variants, even when buff_A
// is real.
// NOTE(review): presumably each complex element of G packs one rotation's
// (cosine, sine) pair -- confirm against the implementations.
FLA_Error FLA_Apply_G_rf_opt_var3( FLA_Obj G, FLA_Obj A );
FLA_Error FLA_Apply_G_rf_ops_var3( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_opd_var3( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_opc_var3( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_opz_var3( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A );

// FLA_Apply_G_rf variant 3, "as" family: object-interface entry (asm)
// followed by typed-buffer counterparts (ass/asd/asc/asz). Parameter lists
// are identical to the corresponding op* routines of this variant.
// NOTE(review): "as" presumably marks assembly/vector-intrinsic
// implementations -- confirm against the definitions.
FLA_Error FLA_Apply_G_rf_asm_var3( FLA_Obj G, FLA_Obj A );
FLA_Error FLA_Apply_G_rf_ass_var3( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asd_var3( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asc_var3( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asz_var3( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A );

// FLA_Apply_G_rf variant 3, "bl" family: object-interface entry (blk)
// followed by typed-buffer counterparts (bls/bld/blc/blz). These extend the
// op* parameter lists with a trailing block size b_alg, which is dim_t in
// the object interface and int in the typed routines.
// NOTE(review): presumably blocked algorithms that process A in panels of
// b_alg -- confirm against the definitions.
FLA_Error FLA_Apply_G_rf_blk_var3( FLA_Obj G, FLA_Obj A, dim_t b_alg );
FLA_Error FLA_Apply_G_rf_bls_var3( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_bld_var3( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_blc_var3( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_blz_var3( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A,
                                   int       b_alg );

// Variant 4

// Variant 4, "op" prototypes. FLA_Apply_G_rf_opt_var4 takes G and A as
// FLA_Obj operands; the ops/opd/opc/opz prototypes take raw buffers instead,
// with buff_A typed float, double, scomplex and dcomplex respectively.
// buff_G is scomplex in the s and c prototypes and dcomplex in the d and z
// prototypes.
// NOTE(review): k_G, m_A and n_A look like dimensions and rs_*/cs_* like
// row/column strides of the G and A buffers -- confirm against the
// implementations, which are not visible from this header.
FLA_Error FLA_Apply_G_rf_opt_var4( FLA_Obj G, FLA_Obj A );
FLA_Error FLA_Apply_G_rf_ops_var4( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_opd_var4( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_opc_var4( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_opz_var4( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A );

// Variant 4, "as" prototypes. FLA_Apply_G_rf_asm_var4 takes G and A as
// FLA_Obj operands; the ass/asd/asc/asz prototypes take raw buffers instead,
// with buff_A typed float, double, scomplex and dcomplex respectively.
// buff_G is scomplex in the s and c prototypes and dcomplex in the d and z
// prototypes. The parameter lists match the "op" prototypes of this variant.
// NOTE(review): how the "as" family differs from "op" is not visible from
// this header -- confirm against the implementations.
FLA_Error FLA_Apply_G_rf_asm_var4( FLA_Obj G, FLA_Obj A );
FLA_Error FLA_Apply_G_rf_ass_var4( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asd_var4( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asc_var4( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asz_var4( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A );

// Variant 4, "bl" prototypes. FLA_Apply_G_rf_blk_var4 takes G and A as
// FLA_Obj operands plus a dim_t b_alg; the bls/bld/blc/blz prototypes take
// raw buffers instead, with buff_A typed float, double, scomplex and
// dcomplex respectively, and end with an int b_alg.
// buff_G is scomplex in the s and c prototypes and dcomplex in the d and z
// prototypes.
// NOTE(review): b_alg is presumably an algorithmic block size, and
// rs_*/cs_* row/column strides -- confirm against the implementations.
FLA_Error FLA_Apply_G_rf_blk_var4( FLA_Obj G, FLA_Obj A, dim_t b_alg );
FLA_Error FLA_Apply_G_rf_bls_var4( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_bld_var4( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_blc_var4( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_blz_var4( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A,
                                   int       b_alg );

// Variant 5

// Variant 5, "op" prototypes. FLA_Apply_G_rf_opt_var5 takes G and A as
// FLA_Obj operands; the ops/opd/opc/opz prototypes take raw buffers instead,
// with buff_A typed float, double, scomplex and dcomplex respectively.
// buff_G is scomplex in the s and c prototypes and dcomplex in the d and z
// prototypes.
// NOTE(review): k_G, m_A and n_A look like dimensions and rs_*/cs_* like
// row/column strides of the G and A buffers -- confirm against the
// implementations, which are not visible from this header.
FLA_Error FLA_Apply_G_rf_opt_var5( FLA_Obj G, FLA_Obj A );
FLA_Error FLA_Apply_G_rf_ops_var5( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_opd_var5( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_opc_var5( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_opz_var5( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A );

// Variant 5, "as" prototypes. FLA_Apply_G_rf_asm_var5 takes G and A as
// FLA_Obj operands; the ass/asd/asc/asz prototypes take raw buffers instead,
// with buff_A typed float, double, scomplex and dcomplex respectively.
// buff_G is scomplex in the s and c prototypes and dcomplex in the d and z
// prototypes. The parameter lists match the "op" prototypes of this variant.
// NOTE(review): how the "as" family differs from "op" is not visible from
// this header -- confirm against the implementations.
FLA_Error FLA_Apply_G_rf_asm_var5( FLA_Obj G, FLA_Obj A );
FLA_Error FLA_Apply_G_rf_ass_var5( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asd_var5( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asc_var5( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asz_var5( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A );

// Variant 5, "bl" prototypes. FLA_Apply_G_rf_blk_var5 takes G and A as
// FLA_Obj operands plus a dim_t b_alg; the bls/bld/blc/blz prototypes take
// raw buffers instead, with buff_A typed float, double, scomplex and
// dcomplex respectively, and end with an int b_alg.
// buff_G is scomplex in the s and c prototypes and dcomplex in the d and z
// prototypes.
// NOTE(review): b_alg is presumably an algorithmic block size, and
// rs_*/cs_* row/column strides -- confirm against the implementations.
FLA_Error FLA_Apply_G_rf_blk_var5( FLA_Obj G, FLA_Obj A, dim_t b_alg );
FLA_Error FLA_Apply_G_rf_bls_var5( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_bld_var5( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_blc_var5( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_blz_var5( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A,
                                   int       b_alg );

// Variant 6

// Variant 6, "op" prototypes. FLA_Apply_G_rf_opt_var6 takes G and A as
// FLA_Obj operands; the ops/opd/opc/opz prototypes take raw buffers instead,
// with buff_A typed float, double, scomplex and dcomplex respectively.
// buff_G is scomplex in the s and c prototypes and dcomplex in the d and z
// prototypes.
// NOTE(review): k_G, m_A and n_A look like dimensions and rs_*/cs_* like
// row/column strides of the G and A buffers -- confirm against the
// implementations, which are not visible from this header.
FLA_Error FLA_Apply_G_rf_opt_var6( FLA_Obj G, FLA_Obj A );
FLA_Error FLA_Apply_G_rf_ops_var6( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_opd_var6( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_opc_var6( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_opz_var6( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A );

// Variant 6, "as" prototypes. FLA_Apply_G_rf_asm_var6 takes G and A as
// FLA_Obj operands; the ass/asd/asc/asz prototypes take raw buffers instead,
// with buff_A typed float, double, scomplex and dcomplex respectively.
// buff_G is scomplex in the s and c prototypes and dcomplex in the d and z
// prototypes. The parameter lists match the "op" prototypes of this variant.
// NOTE(review): how the "as" family differs from "op" is not visible from
// this header -- confirm against the implementations.
FLA_Error FLA_Apply_G_rf_asm_var6( FLA_Obj G, FLA_Obj A );
FLA_Error FLA_Apply_G_rf_ass_var6( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asd_var6( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asc_var6( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asz_var6( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A );

// Variant 6, "bl" prototypes. FLA_Apply_G_rf_blk_var6 takes G and A as
// FLA_Obj operands plus a dim_t b_alg; the bls/bld/blc/blz prototypes take
// raw buffers instead, with buff_A typed float, double, scomplex and
// dcomplex respectively, and end with an int b_alg.
// buff_G is scomplex in the s and c prototypes and dcomplex in the d and z
// prototypes.
// NOTE(review): b_alg is presumably an algorithmic block size, and
// rs_*/cs_* row/column strides -- confirm against the implementations.
FLA_Error FLA_Apply_G_rf_blk_var6( FLA_Obj G, FLA_Obj A, dim_t b_alg );
FLA_Error FLA_Apply_G_rf_bls_var6( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_bld_var6( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_blc_var6( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_blz_var6( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A,
                                   int       b_alg );

// Variant 7

// Variant 7, "op" prototypes. FLA_Apply_G_rf_opt_var7 takes G and A as
// FLA_Obj operands; the ops/opd/opc/opz prototypes take raw buffers instead,
// with buff_A typed float, double, scomplex and dcomplex respectively.
// buff_G is scomplex in the s and c prototypes and dcomplex in the d and z
// prototypes.
// NOTE(review): k_G, m_A and n_A look like dimensions and rs_*/cs_* like
// row/column strides of the G and A buffers -- confirm against the
// implementations, which are not visible from this header.
FLA_Error FLA_Apply_G_rf_opt_var7( FLA_Obj G, FLA_Obj A );
FLA_Error FLA_Apply_G_rf_ops_var7( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_opd_var7( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_opc_var7( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_opz_var7( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A );

// Variant 7, "as" prototypes. FLA_Apply_G_rf_asm_var7 takes G and A as
// FLA_Obj operands; the ass/asd/asc/asz prototypes take raw buffers instead,
// with buff_A typed float, double, scomplex and dcomplex respectively.
// buff_G is scomplex in the s and c prototypes and dcomplex in the d and z
// prototypes. The parameter lists match the "op" prototypes of this variant.
// NOTE(review): how the "as" family differs from "op" is not visible from
// this header -- confirm against the implementations.
FLA_Error FLA_Apply_G_rf_asm_var7( FLA_Obj G, FLA_Obj A );
FLA_Error FLA_Apply_G_rf_ass_var7( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asd_var7( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asc_var7( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asz_var7( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A );

// Variant 7, "bl" prototypes. FLA_Apply_G_rf_blk_var7 takes G and A as
// FLA_Obj operands plus a dim_t b_alg; the bls/bld/blc/blz prototypes take
// raw buffers instead, with buff_A typed float, double, scomplex and
// dcomplex respectively, and end with an int b_alg.
// buff_G is scomplex in the s and c prototypes and dcomplex in the d and z
// prototypes.
// NOTE(review): b_alg is presumably an algorithmic block size, and
// rs_*/cs_* row/column strides -- confirm against the implementations.
FLA_Error FLA_Apply_G_rf_blk_var7( FLA_Obj G, FLA_Obj A, dim_t b_alg );
FLA_Error FLA_Apply_G_rf_bls_var7( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_bld_var7( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_blc_var7( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_blz_var7( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A,
                                   int       b_alg );

// Variant 8

// Variant 8, "op" prototypes. FLA_Apply_G_rf_opt_var8 takes G and A as
// FLA_Obj operands; the ops/opd/opc/opz prototypes take raw buffers instead,
// with buff_A typed float, double, scomplex and dcomplex respectively.
// buff_G is scomplex in the s and c prototypes and dcomplex in the d and z
// prototypes.
// NOTE(review): k_G, m_A and n_A look like dimensions and rs_*/cs_* like
// row/column strides of the G and A buffers -- confirm against the
// implementations, which are not visible from this header.
FLA_Error FLA_Apply_G_rf_opt_var8( FLA_Obj G, FLA_Obj A );
FLA_Error FLA_Apply_G_rf_ops_var8( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_opd_var8( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_opc_var8( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_opz_var8( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A );

// Variant 8, "as" prototypes. FLA_Apply_G_rf_asm_var8 takes G and A as
// FLA_Obj operands; the ass/asd/asc/asz prototypes take raw buffers instead,
// with buff_A typed float, double, scomplex and dcomplex respectively.
// buff_G is scomplex in the s and c prototypes and dcomplex in the d and z
// prototypes. The parameter lists match the "op" prototypes of this variant.
// NOTE(review): how the "as" family differs from "op" is not visible from
// this header -- confirm against the implementations.
FLA_Error FLA_Apply_G_rf_asm_var8( FLA_Obj G, FLA_Obj A );
FLA_Error FLA_Apply_G_rf_ass_var8( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asd_var8( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asc_var8( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asz_var8( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A );

// Variant 8, "bl" prototypes. FLA_Apply_G_rf_blk_var8 takes G and A as
// FLA_Obj operands plus a dim_t b_alg; the bls/bld/blc/blz prototypes take
// raw buffers instead, with buff_A typed float, double, scomplex and
// dcomplex respectively, and end with an int b_alg.
// buff_G is scomplex in the s and c prototypes and dcomplex in the d and z
// prototypes.
// NOTE(review): b_alg is presumably an algorithmic block size, and
// rs_*/cs_* row/column strides -- confirm against the implementations.
FLA_Error FLA_Apply_G_rf_blk_var8( FLA_Obj G, FLA_Obj A, dim_t b_alg );
FLA_Error FLA_Apply_G_rf_bls_var8( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_bld_var8( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_blc_var8( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_blz_var8( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A,
                                   int       b_alg );

// Variant 9

// Variant 9, "op" prototypes. FLA_Apply_G_rf_opt_var9 takes G and A as
// FLA_Obj operands; the ops/opd/opc/opz prototypes take raw buffers instead,
// with buff_A typed float, double, scomplex and dcomplex respectively.
// buff_G is scomplex in the s and c prototypes and dcomplex in the d and z
// prototypes.
// NOTE(review): k_G, m_A and n_A look like dimensions and rs_*/cs_* like
// row/column strides of the G and A buffers -- confirm against the
// implementations, which are not visible from this header.
FLA_Error FLA_Apply_G_rf_opt_var9( FLA_Obj G, FLA_Obj A );
FLA_Error FLA_Apply_G_rf_ops_var9( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_opd_var9( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_opc_var9( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_opz_var9( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A );

// Variant 9, "as" prototypes. FLA_Apply_G_rf_asm_var9 takes G and A as
// FLA_Obj operands; the ass/asd/asc/asz prototypes take raw buffers instead,
// with buff_A typed float, double, scomplex and dcomplex respectively.
// buff_G is scomplex in the s and c prototypes and dcomplex in the d and z
// prototypes. The parameter lists match the "op" prototypes of this variant.
// NOTE(review): how the "as" family differs from "op" is not visible from
// this header -- confirm against the implementations.
FLA_Error FLA_Apply_G_rf_asm_var9( FLA_Obj G, FLA_Obj A );
FLA_Error FLA_Apply_G_rf_ass_var9( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asd_var9( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asc_var9( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asz_var9( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A );

// Variant 9, "bl" prototypes. FLA_Apply_G_rf_blk_var9 takes G and A as
// FLA_Obj operands plus a dim_t b_alg; the bls/bld/blc/blz prototypes take
// raw buffers instead, with buff_A typed float, double, scomplex and
// dcomplex respectively, and end with an int b_alg.
// buff_G is scomplex in the s and c prototypes and dcomplex in the d and z
// prototypes.
// NOTE(review): b_alg is presumably an algorithmic block size, and
// rs_*/cs_* row/column strides -- confirm against the implementations.
FLA_Error FLA_Apply_G_rf_blk_var9( FLA_Obj G, FLA_Obj A, dim_t b_alg );
FLA_Error FLA_Apply_G_rf_bls_var9( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_bld_var9( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_blc_var9( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_blz_var9( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A,
                                   int       b_alg );







// Variant 3b

// Variant 3b, "as" prototypes. FLA_Apply_G_rf_asm_var3b takes G and A as
// FLA_Obj operands; the ass/asd/asc/asz prototypes take raw buffers instead,
// with buff_A typed float, double, scomplex and dcomplex respectively.
// buff_G is scomplex in the s and c prototypes and dcomplex in the d and z
// prototypes. Each typed prototype also takes two int arguments, i_k and
// iTL, between the dimensions and the buffers.
// NOTE(review): the meaning of i_k and iTL is not visible from this header;
// rs_*/cs_* are presumably row/column strides -- confirm against the
// implementations.
FLA_Error FLA_Apply_G_rf_asm_var3b( FLA_Obj G, FLA_Obj A );
FLA_Error FLA_Apply_G_rf_ass_var3b( int       k_G,
                                    int       m_A,
                                    int       n_A,
                                    int       i_k,
                                    int       iTL,
                                    scomplex* buff_G, int rs_G, int cs_G,
                                    float*    buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asd_var3b( int       k_G,
                                    int       m_A,
                                    int       n_A,
                                    int       i_k,
                                    int       iTL,
                                    dcomplex* buff_G, int rs_G, int cs_G,
                                    double*   buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asc_var3b( int       k_G,
                                    int       m_A,
                                    int       n_A,
                                    int       i_k,
                                    int       iTL,
                                    scomplex* buff_G, int rs_G, int cs_G,
                                    scomplex* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asz_var3b( int       k_G,
                                    int       m_A,
                                    int       n_A,
                                    int       i_k,
                                    int       iTL,
                                    dcomplex* buff_G, int rs_G, int cs_G,
                                    dcomplex* buff_A, int rs_A, int cs_A );

// Variant 3b, "bl" prototypes. FLA_Apply_G_rf_blk_var3b takes G and A as
// FLA_Obj operands plus a dim_t b_alg; the bls/bld/blc/blz prototypes take
// raw buffers instead, with buff_A typed float, double, scomplex and
// dcomplex respectively, an int i_k after the dimensions, and a trailing
// int b_alg. buff_G is scomplex in the s and c prototypes and dcomplex in
// the d and z prototypes.
// NOTE(review): these typed prototypes take i_k but no iTL, unlike the
// as*_var3b prototypes declared in this header -- confirm the asymmetry is
// intended. b_alg is presumably an algorithmic block size.
FLA_Error FLA_Apply_G_rf_blk_var3b( FLA_Obj G, FLA_Obj A, dim_t b_alg );
FLA_Error FLA_Apply_G_rf_bls_var3b( int       k_G,
                                    int       m_A,
                                    int       n_A,
                                    int       i_k,
                                    scomplex* buff_G, int rs_G, int cs_G,
                                    float*    buff_A, int rs_A, int cs_A,
                                    int       b_alg );
FLA_Error FLA_Apply_G_rf_bld_var3b( int       k_G,
                                    int       m_A,
                                    int       n_A,
                                    int       i_k,
                                    dcomplex* buff_G, int rs_G, int cs_G,
                                    double*   buff_A, int rs_A, int cs_A,
                                    int       b_alg );
FLA_Error FLA_Apply_G_rf_blc_var3b( int       k_G,
                                    int       m_A,
                                    int       n_A,
                                    int       i_k,
                                    scomplex* buff_G, int rs_G, int cs_G,
                                    scomplex* buff_A, int rs_A, int cs_A,
                                    int       b_alg );
FLA_Error FLA_Apply_G_rf_blz_var3b( int       k_G,
                                    int       m_A,
                                    int       n_A,
                                    int       i_k,
                                    dcomplex* buff_G, int rs_G, int cs_G,
                                    dcomplex* buff_A, int rs_A, int cs_A,
                                    int       b_alg );


// Variant 5b

// Variant 5b, "as" prototypes. FLA_Apply_G_rf_asm_var5b takes G and A as
// FLA_Obj operands; the ass/asd/asc/asz prototypes take raw buffers instead,
// with buff_A typed float, double, scomplex and dcomplex respectively.
// buff_G is scomplex in the s and c prototypes and dcomplex in the d and z
// prototypes. Each typed prototype also takes two int arguments, i_k and
// iTL, between the dimensions and the buffers.
// NOTE(review): the meaning of i_k and iTL is not visible from this header;
// rs_*/cs_* are presumably row/column strides -- confirm against the
// implementations.
FLA_Error FLA_Apply_G_rf_asm_var5b( FLA_Obj G, FLA_Obj A );
FLA_Error FLA_Apply_G_rf_ass_var5b( int       k_G,
                                    int       m_A,
                                    int       n_A,
                                    int       i_k,
                                    int       iTL,
                                    scomplex* buff_G, int rs_G, int cs_G,
                                    float*    buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asd_var5b( int       k_G,
                                    int       m_A,
                                    int       n_A,
                                    int       i_k,
                                    int       iTL,
                                    dcomplex* buff_G, int rs_G, int cs_G,
                                    double*   buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asc_var5b( int       k_G,
                                    int       m_A,
                                    int       n_A,
                                    int       i_k,
                                    int       iTL,
                                    scomplex* buff_G, int rs_G, int cs_G,
                                    scomplex* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asz_var5b( int       k_G,
                                    int       m_A,
                                    int       n_A,
                                    int       i_k,
                                    int       iTL,
                                    dcomplex* buff_G, int rs_G, int cs_G,
                                    dcomplex* buff_A, int rs_A, int cs_A );

// Variant 5b, "bl" prototypes. FLA_Apply_G_rf_blk_var5b takes G and A as
// FLA_Obj operands plus a dim_t b_alg; the bls/bld/blc/blz prototypes take
// raw buffers instead, with buff_A typed float, double, scomplex and
// dcomplex respectively, an int i_k after the dimensions, and a trailing
// int b_alg. buff_G is scomplex in the s and c prototypes and dcomplex in
// the d and z prototypes.
// NOTE(review): these typed prototypes take i_k but no iTL, unlike the
// as*_var5b prototypes declared in this header -- confirm the asymmetry is
// intended. b_alg is presumably an algorithmic block size.
FLA_Error FLA_Apply_G_rf_blk_var5b( FLA_Obj G, FLA_Obj A, dim_t b_alg );
FLA_Error FLA_Apply_G_rf_bls_var5b( int       k_G,
                                    int       m_A,
                                    int       n_A,
                                    int       i_k,
                                    scomplex* buff_G, int rs_G, int cs_G,
                                    float*    buff_A, int rs_A, int cs_A,
                                    int       b_alg );
FLA_Error FLA_Apply_G_rf_bld_var5b( int       k_G,
                                    int       m_A,
                                    int       n_A,
                                    int       i_k,
                                    dcomplex* buff_G, int rs_G, int cs_G,
                                    double*   buff_A, int rs_A, int cs_A,
                                    int       b_alg );
FLA_Error FLA_Apply_G_rf_blc_var5b( int       k_G,
                                    int       m_A,
                                    int       n_A,
                                    int       i_k,
                                    scomplex* buff_G, int rs_G, int cs_G,
                                    scomplex* buff_A, int rs_A, int cs_A,
                                    int       b_alg );
FLA_Error FLA_Apply_G_rf_blz_var5b( int       k_G,
                                    int       m_A,
                                    int       n_A,
                                    int       i_k,
                                    dcomplex* buff_G, int rs_G, int cs_G,
                                    dcomplex* buff_A, int rs_A, int cs_A,
                                    int       b_alg );


// Variant 6b

// Variant 6b, "as" interface.
// The "asm" entry point takes FLA_Obj operands G and A. The ass/asd/asc/asz
// entry points take raw buffers: A is float*, double*, scomplex* and
// dcomplex* respectively, while G is scomplex* for the s/c forms and
// dcomplex* for the d/z forms. Unlike the blocked "bl" prototypes, these take
// an extra int iTL and no block size.
// NOTE(review): rs_*/cs_* are presumably row/column strides; the meaning of
// i_k and iTL is not visible from the prototypes -- confirm in the sources.
FLA_Error FLA_Apply_G_rf_asm_var6b( FLA_Obj G, FLA_Obj A );
FLA_Error FLA_Apply_G_rf_ass_var6b( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   int       i_k,
                                   int       iTL,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asd_var6b( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   int       i_k,
                                   int       iTL,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asc_var6b( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   int       i_k,
                                   int       iTL,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asz_var6b( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   int       i_k,
                                   int       iTL,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A );

// Variant 6b, blocked ("bl") interface.
// "blk" takes FLA_Obj operands plus a dim_t block size; bls/bld/blc/blz take
// raw buffers (A as float*, double*, scomplex*, dcomplex*; G as scomplex* for
// s/c and dcomplex* for d/z) and an int block size b_alg.
// NOTE(review): rs_*/cs_* are presumably row/column strides -- confirm.
FLA_Error FLA_Apply_G_rf_blk_var6b( FLA_Obj G, FLA_Obj A, dim_t b_alg );
FLA_Error FLA_Apply_G_rf_bls_var6b( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   int       i_k,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_bld_var6b( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   int       i_k,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_blc_var6b( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   int       i_k,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_blz_var6b( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   int       i_k,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A,
                                   int       b_alg );


// Variant 8b

// Variant 8b, "as" interface.
// "asm" takes FLA_Obj operands G and A. ass/asd/asc/asz take raw buffers:
// A is float*, double*, scomplex*, dcomplex* respectively; G is scomplex* for
// the s/c forms and dcomplex* for the d/z forms. These carry an int iTL
// argument and no block size.
// NOTE(review): rs_*/cs_* are presumably row/column strides; the meaning of
// i_k and iTL is not visible from the prototypes -- confirm in the sources.
FLA_Error FLA_Apply_G_rf_asm_var8b( FLA_Obj G, FLA_Obj A );
FLA_Error FLA_Apply_G_rf_ass_var8b( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   int       i_k,
                                   int       iTL,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asd_var8b( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   int       i_k,
                                   int       iTL,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asc_var8b( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   int       i_k,
                                   int       iTL,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asz_var8b( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   int       i_k,
                                   int       iTL,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A );

// Variant 8b, blocked ("bl") interface.
// "blk" takes FLA_Obj operands plus a dim_t block size; bls/bld/blc/blz take
// raw buffers (A as float*, double*, scomplex*, dcomplex*; G as scomplex* for
// s/c and dcomplex* for d/z) and an int block size b_alg.
// NOTE(review): rs_*/cs_* are presumably row/column strides -- confirm.
FLA_Error FLA_Apply_G_rf_blk_var8b( FLA_Obj G, FLA_Obj A, dim_t b_alg );
FLA_Error FLA_Apply_G_rf_bls_var8b( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   int       i_k,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_bld_var8b( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   int       i_k,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_blc_var8b( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   int       i_k,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_blz_var8b( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   int       i_k,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A,
                                   int       b_alg );

// Variant 3, "bh" typed entry points for float, double and scomplex A
// buffers (G is scomplex* for the s/c forms, dcomplex* for the d form).
// Compared with the "bl" prototypes these take no i_k argument; they do take
// an int block size b_alg.
// NOTE(review): rs_*/cs_* are presumably row/column strides -- confirm.
FLA_Error FLA_Apply_G_rf_bhs_var3( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_bhd_var3( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_bhc_var3( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A,
                                   int       b_alg );
// Variant 3, "bh" typed entry point for a dcomplex A buffer.
// buff_A is a raw dcomplex* buffer, matching buff_G and every other
// z-suffixed typed prototype in this family (the s/d/c siblings take
// float*, double* and scomplex* respectively); it is not an FLA_Obj pointer.
// NOTE(review): rs_*/cs_* are presumably row/column strides -- confirm.
FLA_Error FLA_Apply_G_rf_bhz_var3( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A,
                                   int       b_alg );


// Variant 9b

// Variant 9b, "as" interface.
// "asm" takes FLA_Obj operands G and A. ass/asd/asc/asz take raw buffers:
// A is float*, double*, scomplex*, dcomplex* respectively; G is scomplex* for
// the s/c forms and dcomplex* for the d/z forms. These carry an int iTL
// argument and no block size.
// NOTE(review): rs_*/cs_* are presumably row/column strides; the meaning of
// i_k and iTL is not visible from the prototypes -- confirm in the sources.
FLA_Error FLA_Apply_G_rf_asm_var9b( FLA_Obj G, FLA_Obj A );
FLA_Error FLA_Apply_G_rf_ass_var9b( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   int       i_k,
                                   int       iTL,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asd_var9b( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   int       i_k,
                                   int       iTL,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asc_var9b( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   int       i_k,
                                   int       iTL,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rf_asz_var9b( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   int       i_k,
                                   int       iTL,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A );

// Variant 9b, blocked ("bl") interface.
// "blk" takes FLA_Obj operands plus a dim_t block size; bls/bld/blc/blz take
// raw buffers (A as float*, double*, scomplex*, dcomplex*; G as scomplex* for
// s/c and dcomplex* for d/z) and an int block size b_alg.
// NOTE(review): rs_*/cs_* are presumably row/column strides -- confirm.
FLA_Error FLA_Apply_G_rf_blk_var9b( FLA_Obj G, FLA_Obj A, dim_t b_alg );
FLA_Error FLA_Apply_G_rf_bls_var9b( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   int       i_k,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   float*    buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_bld_var9b( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   int       i_k,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   double*   buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_blc_var9b( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   int       i_k,
                                   scomplex* buff_G, int rs_G, int cs_G,
                                   scomplex* buff_A, int rs_A, int cs_A,
                                   int       b_alg );
FLA_Error FLA_Apply_G_rf_blz_var9b( int       k_G,
                                   int       m_A,
                                   int       n_A,
                                   int       i_k,
                                   dcomplex* buff_G, int rs_G, int cs_G,
                                   dcomplex* buff_A, int rs_A, int cs_A,
                                   int       b_alg );


// end FLA_Apply_G_rf.h
// begin FLA_Apply_G_rb.h


// "rb" variant 1 interface.
// The "opt" entry point takes FLA_Obj operands c, s and A. The typed
// ops/opd/opc/opz entry points take separate c and s buffers, each with its
// own increment, plus the A buffer. c and s are always real (float* for
// ops/opc, double* for opd/opz) even when A is scomplex*/dcomplex*.
// NOTE(review): inc_* are presumably element strides of the c/s vectors and
// rs_A/cs_A row/column strides of A -- confirm against the implementation.
FLA_Error FLA_Apply_G_rb_opt_var1( FLA_Obj c, FLA_Obj s, FLA_Obj A );
FLA_Error FLA_Apply_G_rb_ops_var1( int       m_A,
                                   int       n_A,
                                   float*    buff_c, int inc_c,
                                   float*    buff_s, int inc_s,
                                   float*    buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rb_opd_var1( int       m_A,
                                   int       n_A,
                                   double*   buff_c, int inc_c,
                                   double*   buff_s, int inc_s,
                                   double*   buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rb_opc_var1( int       m_A,
                                   int       n_A,
                                   float*    buff_c, int inc_c,
                                   float*    buff_s, int inc_s,
                                   scomplex* buff_A, int rs_A, int cs_A );
FLA_Error FLA_Apply_G_rb_opz_var1( int       m_A,
                                   int       n_A,
                                   double*   buff_c, int inc_c,
                                   double*   buff_s, int inc_s,
                                   dcomplex* buff_A, int rs_A, int cs_A );

// end FLA_Apply_G_rb.h

// Object-based front ends: both take a side, a direction, and FLA_Obj
// operands G and A, and return an FLA_Error.
// NOTE(review): presumably FLA_Apply_G validates and then forwards to
// FLA_Apply_G_internal -- not visible from this header; confirm.
FLA_Error FLA_Apply_G( FLA_Side side, FLA_Direct direct, FLA_Obj G, FLA_Obj A );
FLA_Error FLA_Apply_G_internal( FLA_Side side, FLA_Direct direct, FLA_Obj G, FLA_Obj A );

// begin FLA_Givens2.h


// Givens2 function interface: an FLA_Obj-based entry point plus float (ops)
// and double (opd) entry points that take five pointers of the matching
// precision.
// NOTE(review): the MAC_Givens2_* macros in this header read chi_1/chi_2 and
// write gamma, sigma and chi_1_new; these functions presumably follow the
// same input/output convention -- confirm against the implementation.
FLA_Error FLA_Givens2( FLA_Obj chi_1, FLA_Obj chi_2, FLA_Obj gamma, FLA_Obj sigma, FLA_Obj chi_1_new );
FLA_Error FLA_Givens2_ops( float*  chi_1,
                           float*  chi_2,
                           float*  gamma,
                           float*  sigma,
                           float*  chi_1_new );
FLA_Error FLA_Givens2_opd( double* chi_1,
                           double* chi_2,
                           double* gamma,
                           double* sigma,
                           double* chi_1_new );
#define MAC_Givens2_ops( chi_1, chi_2, gamma, sigma, chi_1_new ) \
{ \
	float  chi_1_orig = *(chi_1); \
	float  chi_2_orig = *(chi_2); \
	float  g, s; \
	float  norm_x; \
\
	norm_x = ( float  ) sqrt( ( float ) ( chi_1_orig * chi_1_orig + \
	                                      chi_2_orig * chi_2_orig ) ); \
\
	g = chi_1_orig / norm_x; \
	s = chi_2_orig / norm_x; \
\
	if ( fabs( chi_1_orig ) > fabs( chi_2_orig ) && g < 0.0F ) \
	{ \
		g      = -g; \
		s      = -s; \
		norm_x = -norm_x; \
	} \
\
	*(gamma)     = g; \
	*(sigma)     = s; \
	*(chi_1_new) = norm_x; \
\
}

#define MAC_Givens2_opd( chi_1, chi_2, gamma, sigma, chi_1_new ) \
{ \
	double chi_1_orig = *(chi_1); \
	double chi_2_orig = *(chi_2); \
	double g, s; \
	double norm_x; \
\
	norm_x = ( double ) sqrt( chi_1_orig * chi_1_orig + \
	                          chi_2_orig * chi_2_orig ); \
\
	g = chi_1_orig / norm_x; \
	s = chi_2_orig / norm_x; \
\
	if ( fabs( chi_1_orig ) > fabs( chi_2_orig ) && g < 0.0 ) \
	{ \
		g      = -g; \
		s      = -s; \
		norm_x = -norm_x; \
	} \
\
	*(gamma)     = g; \
	*(sigma)     = s; \
	*(chi_1_new) = norm_x; \
\
}

// end FLA_Givens2.h

// begin FLA_Apply_GTG.h


// Apply_GTG function interface: an FLA_Obj-based entry point plus float (ops)
// and double (opd) entry points that take five pointers of the matching
// precision.
// NOTE(review): the MAC_Apply_GTG_* macros in this header read gamma/sigma
// and update delta1, epsilon and delta2 in place; these functions presumably
// behave the same way -- confirm against the implementation.
FLA_Error FLA_Apply_GTG( FLA_Obj gamma, FLA_Obj sigma, FLA_Obj delta1, FLA_Obj epsilon1, FLA_Obj delta2 );
FLA_Error FLA_Apply_GTG_ops( float*  gamma,
                             float*  sigma,
                             float*  delta1,
                             float*  epsilon1,
                             float*  delta2 );
FLA_Error FLA_Apply_GTG_opd( double* gamma,
                             double* sigma,
                             double* delta1,
                             double* epsilon1,
                             double* delta2 );

// MAC_Apply_GTG_ops (float): in-place update of three scalars.
// With g = *gamma, s = *sigma and the incoming d1 = *delta1, e = *epsilon,
// d2 = *delta2 (all read before any store):
//   *delta1  = g^2 * d1 + 2*g*s*e + s^2 * d2
//   *delta2  = s^2 * d1 - 2*g*s*e + g^2 * d2
//   *epsilon = g*s * (d2 - d1) + e * (g^2 - s^2)
// gamma and sigma are read-only. The 2.0 literal is a double constant, so
// the 2*g*s*e product is formed in double before being stored to float.
#define MAC_Apply_GTG_ops( gamma, sigma, delta1, epsilon, delta2 ) \
{ \
	float  g    = *(gamma); \
	float  s    = *(sigma); \
	float  d1   = *(delta1); \
	float  e    = *(epsilon); \
	float  d2   = *(delta2); \
	float  g2   = g * g; \
	float  s2   = s * s; \
	float  tgse = 2.0 * g * s * e; \
\
	*(delta1)  = g2 * d1 + tgse + s2 * d2; \
	*(delta2)  = s2 * d1 - tgse + g2 * d2; \
	*(epsilon) = g * s * (d2 - d1) + e * (g2 - s2); \
}

// MAC_Apply_GTG_opd (double): in-place update of three scalars.
// With g = *gamma, s = *sigma and the incoming d1 = *delta1, e = *epsilon,
// d2 = *delta2 (all read before any store):
//   *delta1  = g^2 * d1 + 2*g*s*e + s^2 * d2
//   *delta2  = s^2 * d1 - 2*g*s*e + g^2 * d2
//   *epsilon = g*s * (d2 - d1) + e * (g^2 - s^2)
// gamma and sigma are read-only.
#define MAC_Apply_GTG_opd( gamma, sigma, delta1, epsilon, delta2 ) \
{ \
	double g    = *(gamma); \
	double s    = *(sigma); \
	double d1   = *(delta1); \
	double e    = *(epsilon); \
	double d2   = *(delta2); \
	double g2   = g * g; \
	double s2   = s * s; \
	double tgse = 2.0 * g * s * e; \
\
	*(delta1)  = g2 * d1 + tgse + s2 * d2; \
	*(delta2)  = s2 * d1 - tgse + g2 * d2; \
	*(epsilon) = g * s * (d2 - d1) + e * (g2 - s2); \
}

// end FLA_Apply_GTG.h

// begin FLA_Apply_GT_2x2.h


// MAC_Apply_GT_2x2_ops (float): with g = *gamma, s = *sigma and the incoming
// e1 = *epsilon1, d2 = *delta2, e2 = *epsilon2 (all read before any store):
//   *epsilon1 =  g * e1 + s * d2
//   *delta2   = -s * e1 + g * d2
//   *beta     =  s * e2
//   *epsilon2 =  g * e2
// beta is write-only; gamma and sigma are read-only.
#define MAC_Apply_GT_2x2_ops( gamma, sigma, epsilon1, delta2, beta, epsilon2 ) \
{ \
	float g  = *(gamma); \
	float s  = *(sigma); \
	float e1 = *(epsilon1); \
	float d2 = *(delta2); \
	float e2 = *(epsilon2); \
\
	*(epsilon1) =  g * e1 + s * d2; \
	*(delta2)   = -s * e1 + g * d2; \
	*(beta)     = s * e2; \
	*(epsilon2) = g * e2; \
}

// MAC_Apply_GT_2x2_opd (double): with g = *gamma, s = *sigma and the incoming
// e1 = *epsilon1, d2 = *delta2, e2 = *epsilon2 (all read before any store):
//   *epsilon1 =  g * e1 + s * d2
//   *delta2   = -s * e1 + g * d2
//   *beta     =  s * e2
//   *epsilon2 =  g * e2
// beta is write-only; gamma and sigma are read-only.
#define MAC_Apply_GT_2x2_opd( gamma, sigma, epsilon1, delta2, beta, epsilon2 ) \
{ \
	double g  = *(gamma); \
	double s  = *(sigma); \
	double e1 = *(epsilon1); \
	double d2 = *(delta2); \
	double e2 = *(epsilon2); \
\
	*(epsilon1) =  g * e1 + s * d2; \
	*(delta2)   = -s * e1 + g * d2; \
	*(beta)     = s * e2; \
	*(epsilon2) = g * e2; \
}

// MAC_Apply_GT_2x1_ops (float): with g = *gamma, s = *sigma and the incoming
// e1 = *epsilon1, d2 = *delta2 (both read before any store):
//   *epsilon1 =  g * e1 + s * d2
//   *delta2   = -s * e1 + g * d2
#define MAC_Apply_GT_2x1_ops( gamma, sigma, epsilon1, delta2 ) \
{ \
	float g  = *(gamma); \
	float s  = *(sigma); \
	float e1 = *(epsilon1); \
	float d2 = *(delta2); \
\
	*(epsilon1) =  g * e1 + s * d2; \
	*(delta2)   = -s * e1 + g * d2; \
}

// MAC_Apply_GT_2x1_opd (double): with g = *gamma, s = *sigma and the incoming
// e1 = *epsilon1, d2 = *delta2 (both read before any store):
//   *epsilon1 =  g * e1 + s * d2
//   *delta2   = -s * e1 + g * d2
#define MAC_Apply_GT_2x1_opd( gamma, sigma, epsilon1, delta2 ) \
{ \
	double g  = *(gamma); \
	double s  = *(sigma); \
	double e1 = *(epsilon1); \
	double d2 = *(delta2); \
\
	*(epsilon1) =  g * e1 + s * d2; \
	*(delta2)   = -s * e1 + g * d2; \
}

// end FLA_Apply_GT_2x2.h
// begin FLA_Apply_G_2x2.h


// MAC_Apply_G_2x2_ops (float): with g = *gamma, s = *sigma and the incoming
// d1 = *delta1, e1 = *epsilon1, d2 = *delta2 (all read before any store):
//   *delta1   =  g * d1 + s * e1
//   *epsilon1 = -s * d1 + g * e1
//   *beta     =  s * d2
//   *delta2   =  g * d2
// beta is write-only; gamma and sigma are read-only.
#define MAC_Apply_G_2x2_ops( gamma, sigma, delta1, beta, epsilon1, delta2 ) \
{ \
	float g  = *(gamma); \
	float s  = *(sigma); \
	float d1 = *(delta1); \
	float e1 = *(epsilon1); \
	float d2 = *(delta2); \
\
	*(delta1)   =  g * d1 + s * e1; \
	*(epsilon1) = -s * d1 + g * e1; \
	*(beta)     = s * d2; \
	*(delta2)   = g * d2; \
}

// MAC_Apply_G_2x2_opd (double): with g = *gamma, s = *sigma and the incoming
// d1 = *delta1, e1 = *epsilon1, d2 = *delta2 (all read before any store):
//   *delta1   =  g * d1 + s * e1
//   *epsilon1 = -s * d1 + g * e1
//   *beta     =  s * d2
//   *delta2   =  g * d2
// beta is write-only; gamma and sigma are read-only.
#define MAC_Apply_G_2x2_opd( gamma, sigma, delta1, beta, epsilon1, delta2 ) \
{ \
	double g  = *(gamma); \
	double s  = *(sigma); \
	double d1 = *(delta1); \
	double e1 = *(epsilon1); \
	double d2 = *(delta2); \
\
	*(delta1)   =  g * d1 + s * e1; \
	*(epsilon1) = -s * d1 + g * e1; \
	*(beta)     = s * d2; \
	*(delta2)   = g * d2; \
}

// end FLA_Apply_G_2x2.h
// begin FLA_Apply_G_1x2.h


// MAC_Apply_G_1x2_ops: stores *beta = *epsilon * *sigma using the incoming
// *epsilon, then scales *epsilon by *gamma in place. No temporaries are
// taken, so the first store happens before *epsilon and *gamma are re-read.
#define MAC_Apply_G_1x2_ops( gamma, sigma, beta, epsilon ) \
{ \
	*(beta)     = *(epsilon) * *(sigma); \
	*(epsilon) *= *(gamma); \
}

// MAC_Apply_G_1x2_opd: stores *beta = *epsilon * *sigma using the incoming
// *epsilon, then scales *epsilon by *gamma in place. No temporaries are
// taken, so the first store happens before *epsilon and *gamma are re-read.
#define MAC_Apply_G_1x2_opd( gamma, sigma, beta, epsilon ) \
{ \
	*(beta)     = *(epsilon) * *(sigma); \
	*(epsilon) *= *(gamma); \
}

// end FLA_Apply_G_1x2.h

// begin FLA_Apply_G_mx2_opt.h


// MAC_Apply_G_mx2_ops (float): applies one rotation to m_A element pairs.
//   m_A              number of element pairs to update
//   gamma12, sigma12 pointers to the two rotation coefficients (read once)
//   a1, inc_a1       first vector and its element increment
//   a2, inc_a2       second vector and its element increment
// For each pair (x, y) taken from a1/a2:
//   x <-  gamma * x + sigma * y
//   y <- -sigma * x + gamma * y
// Every macro parameter is parenthesized at its point of use so that
// expression arguments such as `coeffs + 1` expand correctly.
// a1 and a2 are bound to restrict-qualified pointers.
#define MAC_Apply_G_mx2_ops( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{ \
	float             ga     = *(gamma12); \
	float             si     = *(sigma12); \
	float*  restrict  alpha1 = (a1); \
	float*  restrict  alpha2 = (a2); \
	float             temp1; \
	float             temp2; \
	int               i; \
\
	for ( i = 0; i < (m_A); ++i ) \
	{ \
		/* Snapshot both inputs before either is overwritten. */ \
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		*alpha1 =  ga * temp1 + si * temp2; \
		*alpha2 = -si * temp1 + ga * temp2; \
\
		alpha1 += (inc_a1); \
		alpha2 += (inc_a2); \
	} \
}

// MAC_Apply_G_mx2_opc (scomplex data, float coefficients): applies one
// rotation to m_A element pairs.
//   m_A              number of element pairs to update
//   gamma12, sigma12 pointers to the two real rotation coefficients
//   a1, inc_a1       first scomplex vector and its element increment
//   a2, inc_a2       second scomplex vector and its element increment
// For each pair (x, y), the real and imaginary parts are updated
// independently:
//   x <-  gamma * x + sigma * y
//   y <- -sigma * x + gamma * y
// Every macro parameter is parenthesized at its point of use so that
// expression arguments such as `coeffs + 1` expand correctly.
#define MAC_Apply_G_mx2_opc( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{ \
	float              ga12   = *(gamma12); \
	float              si12   = *(sigma12); \
	scomplex* restrict alpha1 = (a1); \
	scomplex* restrict alpha2 = (a2); \
	scomplex           temp1; \
	scomplex           temp2; \
	int                i; \
\
	for ( i = 0; i < (m_A); ++i ) \
	{ \
		/* Snapshot both inputs before either is overwritten. */ \
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		alpha1->real =  ga12 * temp1.real + si12 * temp2.real; \
		alpha1->imag =  ga12 * temp1.imag + si12 * temp2.imag; \
\
		alpha2->real = -si12 * temp1.real + ga12 * temp2.real; \
		alpha2->imag = -si12 * temp1.imag + ga12 * temp2.imag; \
\
		alpha1 += (inc_a1); \
		alpha2 += (inc_a2); \
	} \
}

// MAC_Apply_G_mx2_opd (double): applies one rotation to m_A element pairs.
//   m_A              number of element pairs to update
//   gamma12, sigma12 pointers to the two rotation coefficients (read once)
//   a1, inc_a1       first vector and its element increment
//   a2, inc_a2       second vector and its element increment
// For each pair (x, y) taken from a1/a2:
//   x <-  gamma * x + sigma * y
//   y <- -sigma * x + gamma * y
// Every macro parameter is parenthesized at its point of use so that
// expression arguments such as `coeffs + 1` expand correctly.
// a1 and a2 are bound to restrict-qualified pointers.
#define MAC_Apply_G_mx2_opd( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{ \
	double            ga     = *(gamma12); \
	double            si     = *(sigma12); \
	double* restrict  alpha1 = (a1); \
	double* restrict  alpha2 = (a2); \
	double            temp1; \
	double            temp2; \
	int               i; \
\
	for ( i = 0; i < (m_A); ++i ) \
	{ \
		/* Snapshot both inputs before either is overwritten. */ \
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		*alpha1 =  ga * temp1 + si * temp2; \
		*alpha2 = -si * temp1 + ga * temp2; \
\
		alpha1 += (inc_a1); \
		alpha2 += (inc_a2); \
	} \
}

// MAC_Apply_G_mx2_opz (dcomplex data, double coefficients): applies one
// rotation to m_A element pairs.
//   m_A              number of element pairs to update
//   gamma12, sigma12 pointers to the two real rotation coefficients
//   a1, inc_a1       first dcomplex vector and its element increment
//   a2, inc_a2       second dcomplex vector and its element increment
// For each pair (x, y), the real and imaginary parts are updated
// independently:
//   x <-  gamma * x + sigma * y
//   y <- -sigma * x + gamma * y
// Every macro parameter is parenthesized at its point of use so that
// expression arguments such as `coeffs + 1` expand correctly.
#define MAC_Apply_G_mx2_opz( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{ \
	double             ga12   = *(gamma12); \
	double             si12   = *(sigma12); \
	dcomplex* restrict alpha1 = (a1); \
	dcomplex* restrict alpha2 = (a2); \
	dcomplex           temp1; \
	dcomplex           temp2; \
	int                i; \
\
	for ( i = 0; i < (m_A); ++i ) \
	{ \
		/* Snapshot both inputs before either is overwritten. */ \
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		alpha1->real =  ga12 * temp1.real + si12 * temp2.real; \
		alpha1->imag =  ga12 * temp1.imag + si12 * temp2.imag; \
\
		alpha2->real = -si12 * temp1.real + ga12 * temp2.real; \
		alpha2->imag = -si12 * temp1.imag + ga12 * temp2.imag; \
\
		alpha1 += (inc_a1); \
		alpha2 += (inc_a2); \
	} \
}

// end FLA_Apply_G_mx2_opt.h
// begin FLA_Apply_G_mx2_asm.h



#if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS

// No vector intrinsics selected: each "as*" name is a plain alias for the
// portable "op*" macro of the same precision (s, d, c, z).
#define MAC_Apply_G_mx2_ass MAC_Apply_G_mx2_ops
#define MAC_Apply_G_mx2_asd MAC_Apply_G_mx2_opd
#define MAC_Apply_G_mx2_asc MAC_Apply_G_mx2_opc
#define MAC_Apply_G_mx2_asz MAC_Apply_G_mx2_opz

#elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS

#define MAC_Apply_G_mx2_ass( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{\
	int              n_iter32  = m_A / ( 4 * 8 ); \
	int              n_left32  = m_A % ( 4 * 8 ); \
	int              n_iter4   = n_left32 / ( 4 * 1 ); \
	int              n_left    = n_left32 % ( 4 * 1 ); \
	int              i; \
\
	const int        step_a1 = inc_a1 * 4; \
	const int        step_a2 = inc_a2 * 4; \
\
	float*  restrict alpha1 = a1; \
	float*  restrict alpha2 = a2; \
\
	v4sf_t           a1v, a2v; \
	v4sf_t           g12v, s12v; \
	v4sf_t           t1v; \
\
	g12v.v = _mm_load1_ps( gamma12 ); \
	s12v.v = _mm_load1_ps( sigma12 ); \
\
	for ( i = 0; i < n_iter32; ++i ) \
	{ \
\
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
\
		alpha1 += step_a1; \
		alpha2 += step_a2; \
\
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
\
		alpha1 += step_a1; \
		alpha2 += step_a2; \
\
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
\
		alpha1 += step_a1; \
		alpha2 += step_a2; \
\
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
\
		alpha1 += step_a1; \
		alpha2 += step_a2; \
\
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
\
		alpha1 += step_a1; \
		alpha2 += step_a2; \
\
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
\
		alpha1 += step_a1; \
		alpha2 += step_a2; \
\
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
\
		alpha1 += step_a1; \
		alpha2 += step_a2; \
\
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
\
		alpha1 += step_a1; \
		alpha2 += step_a2; \
	} \
\
	for ( i = 0; i < n_iter4; ++i ) \
	{ \
\
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
\
		alpha1 += step_a1; \
		alpha2 += step_a2; \
	} \
\
	for ( i = 0; i < n_left; ++i ) \
	{ \
		float ga12 = *gamma12; \
		float si12 = *sigma12; \
		float temp1; \
		float temp2; \
\
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		*alpha1 = temp1 * ga12 + temp2 * si12; \
		*alpha2 = temp2 * ga12 - temp1 * si12; \
\
		alpha1 += 1; \
		alpha2 += 1; \
	} \
}

#define MAC_Apply_G_mx2_asd( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{\
	int              n_iter16  = m_A / ( 2 * 8 ); \
	int              n_left16  = m_A % ( 2 * 8 ); \
	int              n_iter2   = n_left16 / ( 2 * 1 ); \
	int              n_left    = n_left16 % ( 2 * 1 ); \
	int              i; \
\
	const int        step_a1 = inc_a1 * 2; \
	const int        step_a2 = inc_a2 * 2; \
\
	double* restrict alpha1 = a1; \
	double* restrict alpha2 = a2; \
\
	v2df_t           a1v, a2v; \
	v2df_t           g12v, s12v; \
	v2df_t           t1v; \
\
	g12v.v = _mm_loaddup_pd( gamma12 ); \
	s12v.v = _mm_loaddup_pd( sigma12 ); \
\
	for ( i = 0; i < n_iter16; ++i ) \
	{ \
\
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
\
		alpha1 += step_a1; \
		alpha2 += step_a2; \
\
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
\
		alpha1 += step_a1; \
		alpha2 += step_a2; \
\
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
\
		alpha1 += step_a1; \
		alpha2 += step_a2; \
\
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
\
		alpha1 += step_a1; \
		alpha2 += step_a2; \
\
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
\
		alpha1 += step_a1; \
		alpha2 += step_a2; \
\
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
\
		alpha1 += step_a1; \
		alpha2 += step_a2; \
\
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
\
		alpha1 += step_a1; \
		alpha2 += step_a2; \
\
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
\
		alpha1 += step_a1; \
		alpha2 += step_a2; \
	} \
\
	for ( i = 0; i < n_iter2; ++i ) \
	{ \
\
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
\
		alpha1 += step_a1; \
		alpha2 += step_a2; \
	} \
\
	if ( n_left == 1 ) \
	{ \
		double ga12 = *gamma12; \
		double si12 = *sigma12; \
		double temp1; \
		double temp2; \
\
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		*alpha1 = temp1 * ga12 + temp2 * si12; \
		*alpha2 = temp2 * ga12 - temp1 * si12; \
	} \
}

// Private building block of MAC_Apply_G_mx2_asc; not a public entry point.
// It performs one vector-wide rotation step on the locals declared by the
// enclosing macro (alpha1/alpha2, step_a1/step_a2, a1v/a2v/t1v, g12v/s12v):
//
//   a1 :=  a1 * gamma12 + a2 * sigma12
//   a2 :=  a2 * gamma12 - a1 * sigma12     (using the old a1)
//
// and then advances both column pointers by one vector step.
// _mm_load_ps/_mm_store_ps are the aligned forms, so both pointers must be
// 16-byte aligned at every step.
#define MAC_Apply_G_mx2_asc_step() \
do { \
	a1v.v = _mm_load_ps( ( float* )alpha1 ); \
	a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
	t1v.v = a1v.v; \
	a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
	a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
	_mm_store_ps( ( float* )alpha1, a1v.v ); \
	_mm_store_ps( ( float* )alpha2, a2v.v ); \
\
	alpha1 += step_a1; \
	alpha2 += step_a2; \
} while ( 0 )

// MAC_Apply_G_mx2_asc: SSE variant that applies one real rotation
// (gamma12, sigma12) to m_A pairs of scomplex elements taken from the
// columns a1 and a2. Real and imaginary parts are rotated identically.
//
//   m_A              number of elements in each column
//   gamma12, sigma12 pointers to the float rotation coefficients
//   a1, inc_a1       first column and its element increment
//   a2, inc_a2       second column and its element increment
//
// Each vector step handles two scomplex elements per column (four floats).
// The main loop is unrolled eight times (16 elements per iteration), a
// second loop handles the remaining pairs, and a scalar tail handles an odd
// final element.
// NOTE(review): each vector load reads adjacent elements, so the vector
// loops presumably require inc_a1 == inc_a2 == 1 -- confirm against callers.
//
// The macro arguments are parenthesized wherever operator precedence could
// otherwise change the meaning of an expression argument.
#define MAC_Apply_G_mx2_asc( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{\
	int                n_iter16  = ( m_A ) / ( 2 * 8 ); \
	int                n_left16  = ( m_A ) % ( 2 * 8 ); \
	int                n_iter2   = n_left16 / ( 2 * 1 ); \
	int                n_left    = n_left16 % ( 2 * 1 ); \
	int                i; \
\
	const int          step_a1 = ( inc_a1 ) * 2; \
	const int          step_a2 = ( inc_a2 ) * 2; \
\
	scomplex* restrict alpha1 = ( a1 ); \
	scomplex* restrict alpha2 = ( a2 ); \
\
	v4sf_t             a1v, a2v; \
	v4sf_t             g12v, s12v; \
	v4sf_t             t1v; \
\
	/* Broadcast the coefficients to every vector lane. */ \
	g12v.v = _mm_load1_ps( gamma12 ); \
	s12v.v = _mm_load1_ps( sigma12 ); \
\
	/* Main loop: eight vector steps per iteration. */ \
	for ( i = 0; i < n_iter16; ++i ) \
	{ \
		MAC_Apply_G_mx2_asc_step(); \
		MAC_Apply_G_mx2_asc_step(); \
		MAC_Apply_G_mx2_asc_step(); \
		MAC_Apply_G_mx2_asc_step(); \
		MAC_Apply_G_mx2_asc_step(); \
		MAC_Apply_G_mx2_asc_step(); \
		MAC_Apply_G_mx2_asc_step(); \
		MAC_Apply_G_mx2_asc_step(); \
	} \
\
	/* Remaining full vectors, one step at a time. */ \
	for ( i = 0; i < n_iter2; ++i ) \
	{ \
		MAC_Apply_G_mx2_asc_step(); \
	} \
\
	/* Scalar tail for an odd element count. */ \
	if ( n_left == 1 ) \
	{ \
		float    ga12 = *( gamma12 ); \
		float    si12 = *( sigma12 ); \
		scomplex temp1; \
		scomplex temp2; \
\
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		alpha1->real = temp1.real * ga12 + temp2.real * si12; \
		alpha2->real = temp2.real * ga12 - temp1.real * si12; \
\
		alpha1->imag = temp1.imag * ga12 + temp2.imag * si12; \
		alpha2->imag = temp2.imag * ga12 - temp1.imag * si12; \
	} \
}

// Private building block of MAC_Apply_G_mx2_asz; not a public entry point.
// It performs one vector-wide rotation step on the locals declared by the
// enclosing macro (alpha1/alpha2, step_a1/step_a2, a1v/a2v/t1v, g12v/s12v):
//
//   a1 :=  a1 * gamma12 + a2 * sigma12
//   a2 :=  a2 * gamma12 - a1 * sigma12     (using the old a1)
//
// and then advances both column pointers by one step.
// _mm_load_pd/_mm_store_pd are the aligned forms, so both pointers must be
// 16-byte aligned at every step.
#define MAC_Apply_G_mx2_asz_step() \
do { \
	a1v.v = _mm_load_pd( ( double* )alpha1 ); \
	a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
	t1v.v = a1v.v; \
	a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
	a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
	_mm_store_pd( ( double* )alpha1, a1v.v ); \
	_mm_store_pd( ( double* )alpha2, a2v.v ); \
\
	alpha1 += step_a1; \
	alpha2 += step_a2; \
} while ( 0 )

// MAC_Apply_G_mx2_asz: SSE variant that applies one real rotation
// (gamma12, sigma12) to m_A pairs of dcomplex elements taken from the
// columns a1 and a2. Real and imaginary parts are rotated identically.
//
//   m_A              number of elements in each column
//   gamma12, sigma12 pointers to the double rotation coefficients
//   a1, inc_a1       first column and its element increment
//   a2, inc_a2       second column and its element increment
//
// Each vector step handles exactly one dcomplex element per column (two
// doubles), so the pointers advance by inc_a1/inc_a2 elements per step.
// The main loop is unrolled eight times; the leftover m_A % 8 elements are
// handled one step at a time.
//
// The macro arguments are parenthesized wherever operator precedence could
// otherwise change the meaning of an expression argument.
#define MAC_Apply_G_mx2_asz( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{\
	int                n_iter  = ( m_A ) / 8; \
	int                n_left  = ( m_A ) % 8; \
	int                i; \
\
	const int          step_a1 = ( inc_a1 ) * 1; \
	const int          step_a2 = ( inc_a2 ) * 1; \
\
	dcomplex* restrict alpha1 = ( a1 ); \
	dcomplex* restrict alpha2 = ( a2 ); \
\
	v2df_t             a1v, a2v; \
	v2df_t             g12v, s12v; \
	v2df_t             t1v; \
\
	/* Broadcast the coefficients to both vector lanes. */ \
	g12v.v = _mm_loaddup_pd( gamma12 ); \
	s12v.v = _mm_loaddup_pd( sigma12 ); \
\
	/* Main loop: eight elements per iteration. */ \
	for ( i = 0; i < n_iter; ++i ) \
	{ \
		MAC_Apply_G_mx2_asz_step(); \
		MAC_Apply_G_mx2_asz_step(); \
		MAC_Apply_G_mx2_asz_step(); \
		MAC_Apply_G_mx2_asz_step(); \
		MAC_Apply_G_mx2_asz_step(); \
		MAC_Apply_G_mx2_asz_step(); \
		MAC_Apply_G_mx2_asz_step(); \
		MAC_Apply_G_mx2_asz_step(); \
	} \
\
	/* Leftover elements, one per iteration. */ \
	for ( i = 0; i < n_left; ++i ) \
	{ \
		MAC_Apply_G_mx2_asz_step(); \
	} \
}

#endif
// end FLA_Apply_G_mx2_asm.h

// begin FLA_Apply_G_mx3_opt.h


// MAC_Apply_G_mx3_ops: portable single-precision kernel that applies two
// successive rotations to three strided float columns. For each of the m_A
// elements:
//
//   1. (a1, a2) is rotated with (gamma12, sigma12):
//        a1 :=  a1 * gamma12 + a2 * sigma12
//        a2 :=  a2 * gamma12 - a1 * sigma12     (using the old a1)
//   2. the updated a2 and a3 are rotated the same way with
//      (gamma23, sigma23).
//
//   m_A                number of elements processed in each column
//   gammaXY, sigmaXY   pointers to the float rotation coefficients
//   aK, inc_aK         column K and its element increment
//
// The columns are accessed through restrict-qualified pointers, so they
// must not overlap. Macro arguments are parenthesized where precedence
// matters, so pointer/size expressions (e.g. "g + 1") expand correctly.
#define MAC_Apply_G_mx3_ops( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{ \
	float              ga12   = *( gamma12 ); \
	float              si12   = *( sigma12 ); \
	float              ga23   = *( gamma23 ); \
	float              si23   = *( sigma23 ); \
	float*    restrict alpha1 = ( a1 ); \
	float*    restrict alpha2 = ( a2 ); \
	float*    restrict alpha3 = ( a3 ); \
	float              temp1; \
	float              temp2; \
	float              temp3; \
	int                i; \
\
	for ( i = 0; i < ( m_A ); ++i ) \
	{ \
		/* Rotation 1: columns 1 and 2. */ \
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		*alpha1 = temp1 * ga12 + temp2 * si12; \
		*alpha2 = temp2 * ga12 - temp1 * si12; \
\
		/* Rotation 2: updated column 2 and column 3. */ \
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		*alpha2 = temp2 * ga23 + temp3 * si23; \
		*alpha3 = temp3 * ga23 - temp2 * si23; \
\
		alpha1 += ( inc_a1 ); \
		alpha2 += ( inc_a2 ); \
		alpha3 += ( inc_a3 ); \
	} \
}

// MAC_Apply_G_mx3_opd: portable double-precision kernel that applies two
// successive rotations to three strided double columns. For each of the m_A
// elements:
//
//   1. (a1, a2) is rotated with (gamma12, sigma12):
//        a1 :=  a1 * gamma12 + a2 * sigma12
//        a2 :=  a2 * gamma12 - a1 * sigma12     (using the old a1)
//   2. the updated a2 and a3 are rotated the same way with
//      (gamma23, sigma23).
//
//   m_A                number of elements processed in each column
//   gammaXY, sigmaXY   pointers to the double rotation coefficients
//   aK, inc_aK         column K and its element increment
//
// The columns are accessed through restrict-qualified pointers, so they
// must not overlap. Macro arguments are parenthesized where precedence
// matters, so pointer/size expressions (e.g. "g + 1") expand correctly.
#define MAC_Apply_G_mx3_opd( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{ \
	double             ga12   = *( gamma12 ); \
	double             si12   = *( sigma12 ); \
	double             ga23   = *( gamma23 ); \
	double             si23   = *( sigma23 ); \
	double*   restrict alpha1 = ( a1 ); \
	double*   restrict alpha2 = ( a2 ); \
	double*   restrict alpha3 = ( a3 ); \
	double             temp1; \
	double             temp2; \
	double             temp3; \
	int                i; \
\
	for ( i = 0; i < ( m_A ); ++i ) \
	{ \
		/* Rotation 1: columns 1 and 2. */ \
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		*alpha1 = temp1 * ga12 + temp2 * si12; \
		*alpha2 = temp2 * ga12 - temp1 * si12; \
\
		/* Rotation 2: updated column 2 and column 3. */ \
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		*alpha2 = temp2 * ga23 + temp3 * si23; \
		*alpha3 = temp3 * ga23 - temp2 * si23; \
\
		alpha1 += ( inc_a1 ); \
		alpha2 += ( inc_a2 ); \
		alpha3 += ( inc_a3 ); \
	} \
}

// MAC_Apply_G_mx3_opc: portable kernel that applies two successive real
// rotations to three strided scomplex columns. The real and imaginary parts
// of every element are rotated independently with the same coefficients.
// For each of the m_A elements:
//
//   1. (a1, a2) is rotated with (gamma12, sigma12):
//        a1 :=  gamma12 * a1 + sigma12 * a2
//        a2 := -sigma12 * a1 + gamma12 * a2     (using the old a1)
//   2. the updated a2 and a3 are rotated the same way with
//      (gamma23, sigma23).
//
//   m_A                number of elements processed in each column
//   gammaXY, sigmaXY   pointers to the float rotation coefficients
//   aK, inc_aK         column K and its element increment
//
// The columns are accessed through restrict-qualified pointers, so they
// must not overlap. Macro arguments are parenthesized where precedence
// matters, so pointer/size expressions (e.g. "g + 1") expand correctly.
#define MAC_Apply_G_mx3_opc( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{ \
	float              ga12   = *( gamma12 ); \
	float              si12   = *( sigma12 ); \
	float              ga23   = *( gamma23 ); \
	float              si23   = *( sigma23 ); \
	scomplex* restrict alpha1 = ( a1 ); \
	scomplex* restrict alpha2 = ( a2 ); \
	scomplex* restrict alpha3 = ( a3 ); \
	scomplex           temp1; \
	scomplex           temp2; \
	scomplex           temp3; \
	int                i; \
\
	for ( i = 0; i < ( m_A ); ++i ) \
	{ \
		/* Rotation 1: columns 1 and 2. */ \
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		alpha1->real =  ga12 * temp1.real + si12 * temp2.real; \
		alpha1->imag =  ga12 * temp1.imag + si12 * temp2.imag; \
\
		alpha2->real = -si12 * temp1.real + ga12 * temp2.real; \
		alpha2->imag = -si12 * temp1.imag + ga12 * temp2.imag; \
\
		/* Rotation 2: updated column 2 and column 3. */ \
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		alpha2->real =  ga23 * temp2.real + si23 * temp3.real; \
		alpha2->imag =  ga23 * temp2.imag + si23 * temp3.imag; \
\
		alpha3->real = -si23 * temp2.real + ga23 * temp3.real; \
		alpha3->imag = -si23 * temp2.imag + ga23 * temp3.imag; \
\
		alpha1 += ( inc_a1 ); \
		alpha2 += ( inc_a2 ); \
		alpha3 += ( inc_a3 ); \
	} \
}

// MAC_Apply_G_mx3_opz: portable kernel that applies two successive real
// rotations to three strided dcomplex columns. The real and imaginary parts
// of every element are rotated independently with the same coefficients.
// For each of the m_A elements:
//
//   1. (a1, a2) is rotated with (gamma12, sigma12):
//        a1 :=  gamma12 * a1 + sigma12 * a2
//        a2 := -sigma12 * a1 + gamma12 * a2     (using the old a1)
//   2. the updated a2 and a3 are rotated the same way with
//      (gamma23, sigma23).
//
//   m_A                number of elements processed in each column
//   gammaXY, sigmaXY   pointers to the double rotation coefficients
//   aK, inc_aK         column K and its element increment
//
// The columns are accessed through restrict-qualified pointers, so they
// must not overlap. Macro arguments are parenthesized where precedence
// matters, so pointer/size expressions (e.g. "g + 1") expand correctly.
#define MAC_Apply_G_mx3_opz( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{ \
	double             ga12   = *( gamma12 ); \
	double             si12   = *( sigma12 ); \
	double             ga23   = *( gamma23 ); \
	double             si23   = *( sigma23 ); \
	dcomplex* restrict alpha1 = ( a1 ); \
	dcomplex* restrict alpha2 = ( a2 ); \
	dcomplex* restrict alpha3 = ( a3 ); \
	dcomplex           temp1; \
	dcomplex           temp2; \
	dcomplex           temp3; \
	int                i; \
\
	for ( i = 0; i < ( m_A ); ++i ) \
	{ \
		/* Rotation 1: columns 1 and 2. */ \
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		alpha1->real =  ga12 * temp1.real + si12 * temp2.real; \
		alpha1->imag =  ga12 * temp1.imag + si12 * temp2.imag; \
\
		alpha2->real = -si12 * temp1.real + ga12 * temp2.real; \
		alpha2->imag = -si12 * temp1.imag + ga12 * temp2.imag; \
\
		/* Rotation 2: updated column 2 and column 3. */ \
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		alpha2->real =  ga23 * temp2.real + si23 * temp3.real; \
		alpha2->imag =  ga23 * temp2.imag + si23 * temp3.imag; \
\
		alpha3->real = -si23 * temp2.real + ga23 * temp3.real; \
		alpha3->imag = -si23 * temp2.imag + ga23 * temp3.imag; \
\
		alpha1 += ( inc_a1 ); \
		alpha2 += ( inc_a2 ); \
		alpha3 += ( inc_a3 ); \
	} \
}

// end FLA_Apply_G_mx3_opt.h
// begin FLA_Apply_G_mx3b_opt.h


// MAC_Apply_G_mx3b_ops: portable single-precision kernel that applies two
// successive rotations to three strided float columns, starting with the
// (a2, a3) pair. For each of the m_A elements:
//
//   1. (a2, a3) is rotated with (gamma23, sigma23):
//        a2 :=  a2 * gamma23 + a3 * sigma23
//        a3 :=  a3 * gamma23 - a2 * sigma23     (using the old a2)
//   2. a1 and the updated a2 are rotated the same way with
//      (gamma12, sigma12).
//
//   m_A                number of elements processed in each column
//   gammaXY, sigmaXY   pointers to the float rotation coefficients
//   aK, inc_aK         column K and its element increment
//
// The columns are accessed through restrict-qualified pointers, so they
// must not overlap. Macro arguments are parenthesized where precedence
// matters, so pointer/size expressions (e.g. "g + 1") expand correctly.
#define MAC_Apply_G_mx3b_ops( m_A, \
                              gamma12, \
                              sigma12, \
                              gamma23, \
                              sigma23, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3 ) \
{ \
	float              ga12   = *( gamma12 ); \
	float              si12   = *( sigma12 ); \
	float              ga23   = *( gamma23 ); \
	float              si23   = *( sigma23 ); \
	float*    restrict alpha1 = ( a1 ); \
	float*    restrict alpha2 = ( a2 ); \
	float*    restrict alpha3 = ( a3 ); \
	float              temp1; \
	float              temp2; \
	float              temp3; \
	int                i; \
\
	for ( i = 0; i < ( m_A ); ++i ) \
	{ \
		/* Rotation 1: columns 2 and 3. */ \
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		*alpha2 = temp2 * ga23 + temp3 * si23; \
		*alpha3 = temp3 * ga23 - temp2 * si23; \
\
		/* Rotation 2: column 1 and updated column 2. */ \
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		*alpha1 = temp1 * ga12 + temp2 * si12; \
		*alpha2 = temp2 * ga12 - temp1 * si12; \
\
		alpha1 += ( inc_a1 ); \
		alpha2 += ( inc_a2 ); \
		alpha3 += ( inc_a3 ); \
	} \
}

// MAC_Apply_G_mx3b_opc: portable kernel that applies two successive real
// rotations to three strided scomplex columns, starting with the (a2, a3)
// pair. The real and imaginary parts of every element are rotated
// independently with the same coefficients. For each of the m_A elements:
//
//   1. (a2, a3) is rotated with (gamma23, sigma23):
//        a2 :=  gamma23 * a2 + sigma23 * a3
//        a3 := -sigma23 * a2 + gamma23 * a3     (using the old a2)
//   2. a1 and the updated a2 are rotated the same way with
//      (gamma12, sigma12).
//
//   m_A                number of elements processed in each column
//   gammaXY, sigmaXY   pointers to the float rotation coefficients
//   aK, inc_aK         column K and its element increment
//
// The columns are accessed through restrict-qualified pointers, so they
// must not overlap. Macro arguments are parenthesized where precedence
// matters, so pointer/size expressions (e.g. "g + 1") expand correctly.
#define MAC_Apply_G_mx3b_opc( m_A, \
                              gamma12, \
                              sigma12, \
                              gamma23, \
                              sigma23, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3 ) \
{ \
	float              ga12   = *( gamma12 ); \
	float              si12   = *( sigma12 ); \
	float              ga23   = *( gamma23 ); \
	float              si23   = *( sigma23 ); \
	scomplex* restrict alpha1 = ( a1 ); \
	scomplex* restrict alpha2 = ( a2 ); \
	scomplex* restrict alpha3 = ( a3 ); \
	scomplex           temp1; \
	scomplex           temp2; \
	scomplex           temp3; \
	int                i; \
\
	for ( i = 0; i < ( m_A ); ++i ) \
	{ \
		/* Rotation 1: columns 2 and 3. */ \
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		alpha2->real =  ga23 * temp2.real + si23 * temp3.real; \
		alpha2->imag =  ga23 * temp2.imag + si23 * temp3.imag; \
\
		alpha3->real = -si23 * temp2.real + ga23 * temp3.real; \
		alpha3->imag = -si23 * temp2.imag + ga23 * temp3.imag; \
\
		/* Rotation 2: column 1 and updated column 2. */ \
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		alpha1->real =  ga12 * temp1.real + si12 * temp2.real; \
		alpha1->imag =  ga12 * temp1.imag + si12 * temp2.imag; \
\
		alpha2->real = -si12 * temp1.real + ga12 * temp2.real; \
		alpha2->imag = -si12 * temp1.imag + ga12 * temp2.imag; \
\
		alpha1 += ( inc_a1 ); \
		alpha2 += ( inc_a2 ); \
		alpha3 += ( inc_a3 ); \
	} \
}

// MAC_Apply_G_mx3b_opd: portable double-precision kernel that applies two
// successive rotations to three strided double columns, starting with the
// (a2, a3) pair. For each of the m_A elements:
//
//   1. (a2, a3) is rotated with (gamma23, sigma23):
//        a2 :=  a2 * gamma23 + a3 * sigma23
//        a3 :=  a3 * gamma23 - a2 * sigma23     (using the old a2)
//   2. a1 and the updated a2 are rotated the same way with
//      (gamma12, sigma12).
//
//   m_A                number of elements processed in each column
//   gammaXY, sigmaXY   pointers to the double rotation coefficients
//   aK, inc_aK         column K and its element increment
//
// The columns are accessed through restrict-qualified pointers, so they
// must not overlap. Macro arguments are parenthesized where precedence
// matters, so pointer/size expressions (e.g. "g + 1") expand correctly.
#define MAC_Apply_G_mx3b_opd( m_A, \
                              gamma12, \
                              sigma12, \
                              gamma23, \
                              sigma23, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3 ) \
{ \
	double             ga12   = *( gamma12 ); \
	double             si12   = *( sigma12 ); \
	double             ga23   = *( gamma23 ); \
	double             si23   = *( sigma23 ); \
	double*   restrict alpha1 = ( a1 ); \
	double*   restrict alpha2 = ( a2 ); \
	double*   restrict alpha3 = ( a3 ); \
	double             temp1; \
	double             temp2; \
	double             temp3; \
	int                i; \
\
	for ( i = 0; i < ( m_A ); ++i ) \
	{ \
		/* Rotation 1: columns 2 and 3. */ \
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		*alpha2 = temp2 * ga23 + temp3 * si23; \
		*alpha3 = temp3 * ga23 - temp2 * si23; \
\
		/* Rotation 2: column 1 and updated column 2. */ \
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		*alpha1 = temp1 * ga12 + temp2 * si12; \
		*alpha2 = temp2 * ga12 - temp1 * si12; \
\
		alpha1 += ( inc_a1 ); \
		alpha2 += ( inc_a2 ); \
		alpha3 += ( inc_a3 ); \
	} \
}

// MAC_Apply_G_mx3b_opz: portable kernel that applies two successive real
// rotations to three strided dcomplex columns, starting with the (a2, a3)
// pair. The real and imaginary parts of every element are rotated
// independently with the same coefficients. For each of the m_A elements:
//
//   1. (a2, a3) is rotated with (gamma23, sigma23):
//        a2 :=  gamma23 * a2 + sigma23 * a3
//        a3 := -sigma23 * a2 + gamma23 * a3     (using the old a2)
//   2. a1 and the updated a2 are rotated the same way with
//      (gamma12, sigma12).
//
//   m_A                number of elements processed in each column
//   gammaXY, sigmaXY   pointers to the double rotation coefficients
//   aK, inc_aK         column K and its element increment
//
// The columns are accessed through restrict-qualified pointers, so they
// must not overlap. Macro arguments are parenthesized where precedence
// matters, so pointer/size expressions (e.g. "g + 1") expand correctly.
#define MAC_Apply_G_mx3b_opz( m_A, \
                              gamma12, \
                              sigma12, \
                              gamma23, \
                              sigma23, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3 ) \
{ \
	double             ga12   = *( gamma12 ); \
	double             si12   = *( sigma12 ); \
	double             ga23   = *( gamma23 ); \
	double             si23   = *( sigma23 ); \
	dcomplex* restrict alpha1 = ( a1 ); \
	dcomplex* restrict alpha2 = ( a2 ); \
	dcomplex* restrict alpha3 = ( a3 ); \
	dcomplex           temp1; \
	dcomplex           temp2; \
	dcomplex           temp3; \
	int                i; \
\
	for ( i = 0; i < ( m_A ); ++i ) \
	{ \
		/* Rotation 1: columns 2 and 3. */ \
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		alpha2->real =  ga23 * temp2.real + si23 * temp3.real; \
		alpha2->imag =  ga23 * temp2.imag + si23 * temp3.imag; \
\
		alpha3->real = -si23 * temp2.real + ga23 * temp3.real; \
		alpha3->imag = -si23 * temp2.imag + ga23 * temp3.imag; \
\
		/* Rotation 2: column 1 and updated column 2. */ \
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		alpha1->real =  ga12 * temp1.real + si12 * temp2.real; \
		alpha1->imag =  ga12 * temp1.imag + si12 * temp2.imag; \
\
		alpha2->real = -si12 * temp1.real + ga12 * temp2.real; \
		alpha2->imag = -si12 * temp1.imag + ga12 * temp2.imag; \
\
		alpha1 += ( inc_a1 ); \
		alpha2 += ( inc_a2 ); \
		alpha3 += ( inc_a3 ); \
	} \
}

// end FLA_Apply_G_mx3b_opt.h
// begin FLA_Apply_G_mx3_asm.h



#if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS

// When FLA_VECTOR_INTRINSIC_TYPE selects FLA_NO_INTRINSICS, the "as*" names
// (s/d/c/z variants) are plain aliases for the portable "op*" macros.
#define MAC_Apply_G_mx3_ass MAC_Apply_G_mx3_ops
#define MAC_Apply_G_mx3_asd MAC_Apply_G_mx3_opd
#define MAC_Apply_G_mx3_asc MAC_Apply_G_mx3_opc
#define MAC_Apply_G_mx3_asz MAC_Apply_G_mx3_opz

#elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS

// Private building block of MAC_Apply_G_mx3_ass; not a public entry point.
// It performs one vector-wide step on the locals declared by the enclosing
// macro (alpha1..3, step_a1..3, a1v/a2v/a3v, t1v/t2v, g12v/s12v, g23v/s23v):
//
//   a1 :=  a1 * gamma12 + a2 * sigma12
//   a2 :=  a2 * gamma12 - a1 * sigma12     (using the old a1)
//   a2 :=  a2 * gamma23 + a3 * sigma23     (on the a2 just computed)
//   a3 :=  a3 * gamma23 - a2 * sigma23     (using that same a2)
//
// The intermediate a2 stays in a register between the two rotations, and
// each pointer is advanced right after its store.
// _mm_load_ps/_mm_store_ps are the aligned forms, so all three pointers
// must be 16-byte aligned at every step.
#define MAC_Apply_G_mx3_ass_step() \
do { \
	a1v.v = _mm_load_ps( ( float* )alpha1 ); \
	a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
	t1v.v = a1v.v; \
	a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
	a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
	a3v.v = _mm_load_ps( ( float* )alpha3 ); \
	_mm_store_ps( ( float* )alpha1, a1v.v ); \
	alpha1 += step_a1; \
\
	t2v.v = a2v.v; \
	a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
	a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
	_mm_store_ps( ( float* )alpha2, a2v.v ); \
	alpha2 += step_a2; \
	_mm_store_ps( ( float* )alpha3, a3v.v ); \
	alpha3 += step_a3; \
} while ( 0 )

// MAC_Apply_G_mx3_ass: SSE single-precision variant that applies two
// successive rotations to three float columns: first (a1, a2) with
// (gamma12, sigma12), then the updated a2 and a3 with (gamma23, sigma23).
//
//   m_A                number of elements in each column
//   gammaXY, sigmaXY   pointers to the float rotation coefficients
//   aK, inc_aK         column K and its element increment
//
// Each vector step handles four floats per column. The main loop is
// unrolled eight times (32 elements per iteration), a second loop handles
// the remaining groups of four, and a scalar loop handles the last
// m_A % 4 elements.
// NOTE(review): each vector load reads adjacent floats and the scalar tail
// advances by 1, so the kernel presumably requires unit increments --
// confirm against callers.
//
// step_a2 and step_a3 are derived from inc_a2 and inc_a3 respectively, so
// each column advances by its own increment. The macro arguments are
// parenthesized wherever operator precedence could otherwise change the
// meaning of an expression argument.
#define MAC_Apply_G_mx3_ass( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{\
	int              n_iter32 = ( m_A ) / ( 4 * 8 ); \
	int              n_left32 = ( m_A ) % ( 4 * 8 ); \
	int              n_iter4  = n_left32 / ( 4 * 1 ); \
	int              n_left   = n_left32 % ( 4 * 1 ); \
	int              i; \
\
	const int        step_a1 = ( inc_a1 ) * 4; \
	const int        step_a2 = ( inc_a2 ) * 4; \
	const int        step_a3 = ( inc_a3 ) * 4; \
\
	float* restrict alpha1 = ( a1 ); \
	float* restrict alpha2 = ( a2 ); \
	float* restrict alpha3 = ( a3 ); \
\
	v4sf_t    a1v, a2v, a3v; \
	v4sf_t    g12v, s12v; \
	v4sf_t    g23v, s23v; \
	v4sf_t    t1v, t2v; \
\
	/* Broadcast the coefficients to every vector lane. */ \
	g12v.v = _mm_load1_ps( gamma12 ); \
	s12v.v = _mm_load1_ps( sigma12 ); \
	g23v.v = _mm_load1_ps( gamma23 ); \
	s23v.v = _mm_load1_ps( sigma23 ); \
\
	/* Main loop: eight vector steps per iteration. */ \
	for ( i = 0; i < n_iter32; ++i ) \
	{ \
		MAC_Apply_G_mx3_ass_step(); \
		MAC_Apply_G_mx3_ass_step(); \
		MAC_Apply_G_mx3_ass_step(); \
		MAC_Apply_G_mx3_ass_step(); \
		MAC_Apply_G_mx3_ass_step(); \
		MAC_Apply_G_mx3_ass_step(); \
		MAC_Apply_G_mx3_ass_step(); \
		MAC_Apply_G_mx3_ass_step(); \
	} \
\
	/* Remaining full vectors, one step at a time. */ \
	for ( i = 0; i < n_iter4; ++i ) \
	{ \
		MAC_Apply_G_mx3_ass_step(); \
	} \
\
	/* Scalar tail for the last m_A % 4 elements. */ \
	for ( i = 0; i < n_left; ++i ) \
	{ \
		float ga12 = *( gamma12 ); \
		float si12 = *( sigma12 ); \
		float ga23 = *( gamma23 ); \
		float si23 = *( sigma23 ); \
		float temp1; \
		float temp2; \
		float temp3; \
\
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		*alpha1 = temp1 * ga12 + temp2 * si12; \
		*alpha2 = temp2 * ga12 - temp1 * si12; \
\
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		*alpha2 = temp2 * ga23 + temp3 * si23; \
		*alpha3 = temp3 * ga23 - temp2 * si23; \
\
		alpha1 += 1; \
		alpha2 += 1; \
		alpha3 += 1; \
	} \
}

// Private building block of MAC_Apply_G_mx3_asd; not a public entry point.
// It performs one vector-wide step on the locals declared by the enclosing
// macro (alpha1..3, step_a1..3, a1v/a2v/a3v, t1v/t2v, g12v/s12v, g23v/s23v):
//
//   a1 :=  a1 * gamma12 + a2 * sigma12
//   a2 :=  a2 * gamma12 - a1 * sigma12     (using the old a1)
//   a2 :=  a2 * gamma23 + a3 * sigma23     (on the a2 just computed)
//   a3 :=  a3 * gamma23 - a2 * sigma23     (using that same a2)
//
// The intermediate a2 stays in a register between the two rotations, and
// each pointer is advanced right after its store.
// _mm_load_pd/_mm_store_pd are the aligned forms, so all three pointers
// must be 16-byte aligned at every step.
#define MAC_Apply_G_mx3_asd_step() \
do { \
	a1v.v = _mm_load_pd( ( double* )alpha1 ); \
	a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
	t1v.v = a1v.v; \
	a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
	a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
	a3v.v = _mm_load_pd( ( double* )alpha3 ); \
	_mm_store_pd( ( double* )alpha1, a1v.v ); \
	alpha1 += step_a1; \
\
	t2v.v = a2v.v; \
	a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
	a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
	_mm_store_pd( ( double* )alpha2, a2v.v ); \
	alpha2 += step_a2; \
	_mm_store_pd( ( double* )alpha3, a3v.v ); \
	alpha3 += step_a3; \
} while ( 0 )

// MAC_Apply_G_mx3_asd: SSE double-precision variant that applies two
// successive rotations to three double columns: first (a1, a2) with
// (gamma12, sigma12), then the updated a2 and a3 with (gamma23, sigma23).
//
//   m_A                number of elements in each column
//   gammaXY, sigmaXY   pointers to the double rotation coefficients
//   aK, inc_aK         column K and its element increment
//
// Each vector step handles two doubles per column. The main loop is
// unrolled eight times (16 elements per iteration), a second loop handles
// the remaining pairs, and a scalar tail handles an odd final element.
// NOTE(review): each vector load reads adjacent doubles, so the vector
// loops presumably require unit increments -- confirm against callers.
//
// step_a2 and step_a3 are derived from inc_a2 and inc_a3 respectively, so
// each column advances by its own increment. The macro arguments are
// parenthesized wherever operator precedence could otherwise change the
// meaning of an expression argument.
#define MAC_Apply_G_mx3_asd( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{\
	int              n_iter16 = ( m_A ) / ( 2 * 8 ); \
	int              n_left16 = ( m_A ) % ( 2 * 8 ); \
	int              n_iter2  = n_left16 / ( 2 * 1 ); \
	int              n_left   = n_left16 % ( 2 * 1 ); \
	int              i; \
\
	const int        step_a1 = ( inc_a1 ) * 2; \
	const int        step_a2 = ( inc_a2 ) * 2; \
	const int        step_a3 = ( inc_a3 ) * 2; \
\
	double* restrict alpha1 = ( a1 ); \
	double* restrict alpha2 = ( a2 ); \
	double* restrict alpha3 = ( a3 ); \
\
	v2df_t           a1v, a2v, a3v; \
	v2df_t           g12v, s12v; \
	v2df_t           g23v, s23v; \
	v2df_t           t1v, t2v; \
\
	/* Broadcast the coefficients to both vector lanes. */ \
	g12v.v = _mm_loaddup_pd( gamma12 ); \
	s12v.v = _mm_loaddup_pd( sigma12 ); \
	g23v.v = _mm_loaddup_pd( gamma23 ); \
	s23v.v = _mm_loaddup_pd( sigma23 ); \
\
	/* Main loop: eight vector steps per iteration. */ \
	for ( i = 0; i < n_iter16; ++i ) \
	{ \
		MAC_Apply_G_mx3_asd_step(); \
		MAC_Apply_G_mx3_asd_step(); \
		MAC_Apply_G_mx3_asd_step(); \
		MAC_Apply_G_mx3_asd_step(); \
		MAC_Apply_G_mx3_asd_step(); \
		MAC_Apply_G_mx3_asd_step(); \
		MAC_Apply_G_mx3_asd_step(); \
		MAC_Apply_G_mx3_asd_step(); \
	} \
\
	/* Remaining full vectors, one step at a time. */ \
	for ( i = 0; i < n_iter2; ++i ) \
	{ \
		MAC_Apply_G_mx3_asd_step(); \
	} \
\
	/* Scalar tail for an odd element count. */ \
	if ( n_left == 1 ) \
	{ \
		double ga12 = *( gamma12 ); \
		double si12 = *( sigma12 ); \
		double ga23 = *( gamma23 ); \
		double si23 = *( sigma23 ); \
		double temp1; \
		double temp2; \
		double temp3; \
\
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		*alpha1 = temp1 * ga12 + temp2 * si12; \
		*alpha2 = temp2 * ga12 - temp1 * si12; \
\
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		*alpha2 = temp2 * ga23 + temp3 * si23; \
		*alpha3 = temp3 * ga23 - temp2 * si23; \
	} \
}

// MAC_Apply_G_mx3_asc
//
// SSE kernel that applies two successive plane rotations to three columns of
// single-precision complex values (a1, a2, a3), m_A elements each. For every
// element the update is, in this order:
//
//   rotation 1-2:  a1 <- a1 * gamma12 + a2 * sigma12
//                  a2 <- a2 * gamma12 - a1 * sigma12   (a1 = value before update)
//   rotation 2-3:  a2 <- a2 * gamma23 + a3 * sigma23
//                  a3 <- a3 * gamma23 - a2 * sigma23   (a2 = value after rotation 1-2)
//
// The rotation coefficients are real, so the real and imaginary parts are
// scaled identically.
//
// Parameters:
//   m_A              number of elements per column.
//   gamma12, sigma12 pointers to the float coefficients of rotation 1-2.
//   gamma23, sigma23 pointers to the float coefficients of rotation 2-3.
//   a1, a2, a3       scomplex pointers to the three columns (updated in place).
//   inc_a1..inc_a3   per-column element increments.
//
// Layout: one v4sf_t vector covers two scomplex elements. The first loop is
// unrolled 8x (16 elements per iteration), the second loop handles remaining
// pairs, and a scalar branch handles a single leftover element.
//
// The columns are accessed through _mm_load_ps/_mm_store_ps (the aligned
// variants) and are declared restrict, so they must not overlap.
// NOTE(review): each vector load reads two adjacent elements, so this looks
// correct only for unit increments -- confirm against the callers.
//
// Each column now advances by its own increment; previously step_a2 and
// step_a3 were both derived from inc_a1, ignoring inc_a2 and inc_a3.
#define MAC_Apply_G_mx3_asc( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{ \
	int                n_iter16 = ( m_A ) / ( 2 * 8 ); \
	int                n_left16 = ( m_A ) % ( 2 * 8 ); \
	int                n_iter2  = n_left16 / ( 2 * 1 ); \
	int                n_left   = n_left16 % ( 2 * 1 ); \
	int                i; \
\
	/* Pointer advance per vector step: two elements of each column. */ \
	const int          step_a1 = ( inc_a1 ) * 2; \
	const int          step_a2 = ( inc_a2 ) * 2; \
	const int          step_a3 = ( inc_a3 ) * 2; \
\
	scomplex* restrict alpha1 = a1; \
	scomplex* restrict alpha2 = a2; \
	scomplex* restrict alpha3 = a3; \
\
	v4sf_t             a1v, a2v, a3v; \
	v4sf_t             g12v, s12v; \
	v4sf_t             g23v, s23v; \
	v4sf_t             t1v, t2v; \
\
	/* Broadcast each real coefficient to all four vector lanes. */ \
	g12v.v = _mm_load1_ps( gamma12 ); \
	s12v.v = _mm_load1_ps( sigma12 ); \
	g23v.v = _mm_load1_ps( gamma23 ); \
	s23v.v = _mm_load1_ps( sigma23 ); \
\
	/* Main loop: eight unrolled vector steps (16 elements) per iteration. */ \
	for ( i = 0; i < n_iter16; ++i ) \
	{ \
\
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
\
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
\
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
\
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
\
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
\
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
\
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
\
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
	} \
\
	/* Remainder loop: one vector step (two elements) per iteration. */ \
	for ( i = 0; i < n_iter2; ++i ) \
	{ \
\
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
	} \
\
	/* Odd m_A: rotate the final element with scalar arithmetic. */ \
	if ( n_left == 1 ) \
	{ \
		float ga12 = *( gamma12 ); \
		float si12 = *( sigma12 ); \
		float ga23 = *( gamma23 ); \
		float si23 = *( sigma23 ); \
		scomplex temp1; \
		scomplex temp2; \
		scomplex temp3; \
\
		/* Rotation 1-2. */ \
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		alpha1->real = temp1.real * ga12 + temp2.real * si12; \
		alpha2->real = temp2.real * ga12 - temp1.real * si12; \
\
		alpha1->imag = temp1.imag * ga12 + temp2.imag * si12; \
		alpha2->imag = temp2.imag * ga12 - temp1.imag * si12; \
\
		/* Rotation 2-3, using the a2 value produced above. */ \
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		alpha2->real = temp2.real * ga23 + temp3.real * si23; \
		alpha3->real = temp3.real * ga23 - temp2.real * si23; \
\
		alpha2->imag = temp2.imag * ga23 + temp3.imag * si23; \
		alpha3->imag = temp3.imag * ga23 - temp2.imag * si23; \
	} \
}

// MAC_Apply_G_mx3_asz
//
// SSE kernel that applies two successive plane rotations to three columns of
// double-precision complex values (a1, a2, a3), m_A elements each. For every
// element the update is, in this order:
//
//   rotation 1-2:  a1 <- a1 * gamma12 + a2 * sigma12
//                  a2 <- a2 * gamma12 - a1 * sigma12   (a1 = value before update)
//   rotation 2-3:  a2 <- a2 * gamma23 + a3 * sigma23
//                  a3 <- a3 * gamma23 - a2 * sigma23   (a2 = value after rotation 1-2)
//
// Parameters:
//   m_A              number of elements per column.
//   gamma12, sigma12 pointers to the double coefficients of rotation 1-2.
//   gamma23, sigma23 pointers to the double coefficients of rotation 2-3.
//   a1, a2, a3       dcomplex pointers to the three columns (updated in place).
//   inc_a1..inc_a3   per-column element increments.
//
// Layout: every vector step loads two doubles from one dcomplex pointer and
// advances that pointer by one increment. The first loop is unrolled 8x; the
// second loop processes the remaining m_A % 8 elements one at a time, still
// with vector arithmetic.
//
// The columns are accessed through _mm_load_pd/_mm_store_pd (the aligned
// variants) and are declared restrict, so they must not overlap.
//
// Each column now advances by its own increment; previously step_a2 and
// step_a3 were both derived from inc_a1, ignoring inc_a2 and inc_a3.
#define MAC_Apply_G_mx3_asz( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{\
	int                n_iter = ( m_A ) / 8; \
	int                n_left = ( m_A ) % 8; \
	int                i; \
\
	/* Pointer advance per vector step: one element of each column. */ \
	const int          step_a1 = ( inc_a1 ) * 1; \
	const int          step_a2 = ( inc_a2 ) * 1; \
	const int          step_a3 = ( inc_a3 ) * 1; \
\
	dcomplex* restrict alpha1 = a1; \
	dcomplex* restrict alpha2 = a2; \
	dcomplex* restrict alpha3 = a3; \
\
	v2df_t             a1v, a2v, a3v; \
	v2df_t             g12v, s12v; \
	v2df_t             g23v, s23v; \
	v2df_t             t1v, t2v; \
\
	/* Broadcast each real coefficient to both vector lanes. */ \
	g12v.v = _mm_loaddup_pd( gamma12 ); \
	s12v.v = _mm_loaddup_pd( sigma12 ); \
	g23v.v = _mm_loaddup_pd( gamma23 ); \
	s23v.v = _mm_loaddup_pd( sigma23 ); \
\
	/* Main loop: eight unrolled vector steps (8 elements) per iteration. */ \
	for ( i = 0; i < n_iter; ++i ) \
	{ \
\
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha2 += step_a2; \
		alpha3 += step_a3; \
\
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha2 += step_a2; \
		alpha3 += step_a3; \
\
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha2 += step_a2; \
		alpha3 += step_a3; \
\
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha2 += step_a2; \
		alpha3 += step_a3; \
\
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha2 += step_a2; \
		alpha3 += step_a3; \
\
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha2 += step_a2; \
		alpha3 += step_a3; \
\
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha2 += step_a2; \
		alpha3 += step_a3; \
\
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha2 += step_a2; \
		alpha3 += step_a3; \
	} \
\
	/* Remainder loop: the last m_A % 8 elements, one vector step each. */ \
	for ( i = 0; i < n_left; ++i ) \
	{ \
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
	} \
}

#endif
// end FLA_Apply_G_mx3_asm.h
// begin FLA_Apply_G_mx3b_asm.h



#if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS

// Build without vector intrinsics (FLA_VECTOR_INTRINSIC_TYPE is
// FLA_NO_INTRINSICS): each MAC_Apply_G_mx3b_as? name is a plain alias for the
// matching MAC_Apply_G_mx3b_op? macro. Those macros are defined outside this
// section (presumably the portable, non-SSE variants -- verify).
#define MAC_Apply_G_mx3b_ass MAC_Apply_G_mx3b_ops
#define MAC_Apply_G_mx3b_asd MAC_Apply_G_mx3b_opd
#define MAC_Apply_G_mx3b_asc MAC_Apply_G_mx3b_opc
#define MAC_Apply_G_mx3b_asz MAC_Apply_G_mx3b_opz

#elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS

#define MAC_Apply_G_mx3b_ass( m_A, \
                              gamma12, \
                              sigma12, \
                              gamma23, \
                              sigma23, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3 ) \
{\
	int              n_iter32 = m_A / ( 4 * 8 ); \
	int              n_left32 = m_A % ( 4 * 8 ); \
	int              n_iter4  = n_left32 / ( 4 * 1 ); \
	int              n_left   = n_left32 % ( 4 * 1 ); \
	int              i; \
\
	const int        step_a1 = inc_a1 * 4; \
	const int        step_a2 = inc_a2 * 4; \
	const int        step_a3 = inc_a3 * 4; \
\
	float*  restrict alpha1 = a1; \
	float*  restrict alpha2 = a2; \
	float*  restrict alpha3 = a3; \
\
	v4sf_t           a1v, a2v, a3v; \
	v4sf_t           g12v, s12v; \
	v4sf_t           g23v, s23v; \
	v4sf_t           t1v, t2v; \
\
	g12v.v = _mm_load1_ps( gamma12 ); \
	s12v.v = _mm_load1_ps( sigma12 ); \
	g23v.v = _mm_load1_ps( gamma23 ); \
	s23v.v = _mm_load1_ps( sigma23 ); \
\
	for ( i = 0; i < n_iter32; ++i ) \
	{ \
\
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
\
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
\
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
\
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
\
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
\
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
\
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
\
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
	} \
\
	for ( i = 0; i < n_iter4; ++i ) \
	{ \
\
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
	} \
\
	for ( i = 0; i < n_left; ++i ) \
	{ \
		float ga12 = *gamma12; \
		float si12 = *sigma12; \
		float ga23 = *gamma23; \
		float si23 = *sigma23; \
		float temp1; \
		float temp2; \
		float temp3; \
\
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		*alpha2 = temp2 * ga23 + temp3 * si23; \
		*alpha3 = temp3 * ga23 - temp2 * si23; \
\
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		*alpha1 = temp1 * ga12 + temp2 * si12; \
		*alpha2 = temp2 * ga12 - temp1 * si12; \
\
		alpha1 += 1; \
		alpha2 += 1; \
		alpha3 += 1; \
	} \
}

#define MAC_Apply_G_mx3b_asd( m_A, \
                              gamma12, \
                              sigma12, \
                              gamma23, \
                              sigma23, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3 ) \
{\
	int              n_iter16 = m_A / ( 2 * 8 ); \
	int              n_left16 = m_A % ( 2 * 8 ); \
	int              n_iter2  = n_left16 / ( 2 * 1 ); \
	int              n_left   = n_left16 % ( 2 * 1 ); \
	int              i; \
\
	const int        step_a1 = inc_a1 * 2; \
	const int        step_a2 = inc_a2 * 2; \
	const int        step_a3 = inc_a3 * 2; \
\
	double* restrict alpha1 = a1; \
	double* restrict alpha2 = a2; \
	double* restrict alpha3 = a3; \
\
	v2df_t           a1v, a2v, a3v; \
	v2df_t           g12v, s12v; \
	v2df_t           g23v, s23v; \
	v2df_t           t1v, t2v; \
\
	g12v.v = _mm_loaddup_pd( gamma12 ); \
	s12v.v = _mm_loaddup_pd( sigma12 ); \
	g23v.v = _mm_loaddup_pd( gamma23 ); \
	s23v.v = _mm_loaddup_pd( sigma23 ); \
\
	for ( i = 0; i < n_iter16; ++i ) \
	{ \
\
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
\
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
\
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
\
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
\
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
\
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
\
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
\
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
	} \
\
	for ( i = 0; i < n_iter2; ++i ) \
	{ \
\
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
	} \
\
	if ( n_left == 1 ) \
	{ \
		double ga12 = *gamma12; \
		double si12 = *sigma12; \
		double ga23 = *gamma23; \
		double si23 = *sigma23; \
		double temp1; \
		double temp2; \
		double temp3; \
\
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		*alpha2 = temp2 * ga23 + temp3 * si23; \
		*alpha3 = temp3 * ga23 - temp2 * si23; \
\
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		*alpha1 = temp1 * ga12 + temp2 * si12; \
		*alpha2 = temp2 * ga12 - temp1 * si12; \
	} \
}

// MAC_Apply_G_mx3b_asc
//
// SSE kernel that applies two successive plane rotations to three columns of
// single-precision complex values (a1, a2, a3), m_A elements each. Rotation
// 2-3 is applied first and rotation 1-2 second:
//
//   rotation 2-3:  a2 <- a2 * gamma23 + a3 * sigma23
//                  a3 <- a3 * gamma23 - a2 * sigma23   (a2 = value before update)
//   rotation 1-2:  a1 <- a1 * gamma12 + a2 * sigma12   (a2 = value after rotation 2-3)
//                  a2 <- a2 * gamma12 - a1 * sigma12   (a1 = value before update)
//
// The rotation coefficients are real, so the real and imaginary parts are
// scaled identically.
//
// Parameters:
//   m_A              number of elements per column.
//   gamma12, sigma12 pointers to the float coefficients of rotation 1-2.
//   gamma23, sigma23 pointers to the float coefficients of rotation 2-3.
//   a1, a2, a3       scomplex pointers to the three columns (updated in place).
//   inc_a1..inc_a3   per-column element increments.
//
// Layout: one v4sf_t vector covers two scomplex elements. The first loop is
// unrolled 8x (16 elements per iteration), the second loop handles remaining
// pairs, and a scalar branch handles a single leftover element.
//
// The columns are accessed through _mm_load_ps/_mm_store_ps (the aligned
// variants) and are declared restrict, so they must not overlap.
// NOTE(review): each vector load reads two adjacent elements, so this looks
// correct only for unit increments -- confirm against the callers.
//
// The scalar leftover branch now applies rotation 2-3 before rotation 1-2,
// matching the vector loops; it previously used the opposite order, so the
// last element of an odd-length column was rotated differently from the rest.
#define MAC_Apply_G_mx3b_asc( m_A, \
                              gamma12, \
                              sigma12, \
                              gamma23, \
                              sigma23, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3 ) \
{\
	int                n_iter16 = ( m_A ) / ( 2 * 8 ); \
	int                n_left16 = ( m_A ) % ( 2 * 8 ); \
	int                n_iter2  = n_left16 / ( 2 * 1 ); \
	int                n_left   = n_left16 % ( 2 * 1 ); \
	int                i; \
\
	/* Pointer advance per vector step: two elements of each column. */ \
	const int          step_a1 = ( inc_a1 ) * 2; \
	const int          step_a2 = ( inc_a2 ) * 2; \
	const int          step_a3 = ( inc_a3 ) * 2; \
\
	scomplex* restrict alpha1 = a1; \
	scomplex* restrict alpha2 = a2; \
	scomplex* restrict alpha3 = a3; \
\
	v4sf_t             a1v, a2v, a3v; \
	v4sf_t             g12v, s12v; \
	v4sf_t             g23v, s23v; \
	v4sf_t             t1v, t2v; \
\
	/* Broadcast each real coefficient to all four vector lanes. */ \
	g12v.v = _mm_load1_ps( gamma12 ); \
	s12v.v = _mm_load1_ps( sigma12 ); \
	g23v.v = _mm_load1_ps( gamma23 ); \
	s23v.v = _mm_load1_ps( sigma23 ); \
\
	/* Main loop: eight unrolled vector steps (16 elements) per iteration. */ \
	for ( i = 0; i < n_iter16; ++i ) \
	{ \
\
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
\
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
\
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
\
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
\
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
\
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
\
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
\
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
	} \
\
	/* Remainder loop: one vector step (two elements) per iteration. */ \
	for ( i = 0; i < n_iter2; ++i ) \
	{ \
\
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
	} \
\
	/* Odd m_A: rotate the final element with scalar arithmetic. */ \
	if ( n_left == 1 ) \
	{ \
		float ga12 = *( gamma12 ); \
		float si12 = *( sigma12 ); \
		float ga23 = *( gamma23 ); \
		float si23 = *( sigma23 ); \
		scomplex temp1; \
		scomplex temp2; \
		scomplex temp3; \
\
		/* Rotation 2-3 first, as in the vector loops above. */ \
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		alpha2->real = temp2.real * ga23 + temp3.real * si23; \
		alpha3->real = temp3.real * ga23 - temp2.real * si23; \
\
		alpha2->imag = temp2.imag * ga23 + temp3.imag * si23; \
		alpha3->imag = temp3.imag * ga23 - temp2.imag * si23; \
\
		/* Rotation 1-2, using the a2 value produced above. */ \
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		alpha1->real = temp1.real * ga12 + temp2.real * si12; \
		alpha2->real = temp2.real * ga12 - temp1.real * si12; \
\
		alpha1->imag = temp1.imag * ga12 + temp2.imag * si12; \
		alpha2->imag = temp2.imag * ga12 - temp1.imag * si12; \
	} \
}

// MAC_Apply_G_mx3b_asz: SSE kernel that applies two plane rotations to three
// dcomplex columns.
//
// For each of the m_A elements the macro performs, in this order:
//   1. (a2, a3) rotated with gamma23/sigma23
//   2. (a1, a2) rotated with gamma12/sigma12
// where a rotation of (x, y) by (gamma, sigma) is
//   x' = gamma * x + sigma * y;   y' = gamma * y - sigma * x.
//
// Parameters:
//   m_A              number of elements to process in each column.
//   gamma12, sigma12 pointers to double; each value is broadcast to both
//   gamma23, sigma23 lanes of a vector with _mm_loaddup_pd.
//   a1, a2, a3       dcomplex pointers to the first element of each column.
//   inc_a1..inc_a3   per-element pointer increments, in dcomplex units.
//
// Each 128-bit vector is loaded from one dcomplex element (two doubles), so
// the same real gamma/sigma scale both doubles of the element. The main loop
// is unrolled 8 times; the trailing loop handles the m_A % 8 leftover
// elements with the same vector code.
//
// The aligned _mm_load_pd/_mm_store_pd intrinsics are used, so every accessed
// element is expected to be 16-byte aligned (see Intel's intrinsics docs).
//
// All macro arguments are parenthesized where they appear in expressions so
// that callers may safely pass compound expressions such as `m - 1`.
#define MAC_Apply_G_mx3b_asz( m_A, \
                              gamma12, \
                              sigma12, \
                              gamma23, \
                              sigma23, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3 ) \
{ \
	int                n_iter = ( m_A ) / 8; \
	int                n_left = ( m_A ) % 8; \
	int                i; \
\
	const int          step_a1 = ( inc_a1 ) * 1; \
	const int          step_a2 = ( inc_a2 ) * 1; \
	const int          step_a3 = ( inc_a3 ) * 1; \
\
	dcomplex* restrict alpha1 = ( a1 ); \
	dcomplex* restrict alpha2 = ( a2 ); \
	dcomplex* restrict alpha3 = ( a3 ); \
\
	v2df_t             a1v, a2v, a3v; \
	v2df_t             g12v, s12v; \
	v2df_t             g23v, s23v; \
	v2df_t             t1v, t2v; \
\
	/* Broadcast each rotation coefficient into both vector lanes. */ \
	g12v.v = _mm_loaddup_pd( gamma12 ); \
	s12v.v = _mm_loaddup_pd( sigma12 ); \
	g23v.v = _mm_loaddup_pd( gamma23 ); \
	s23v.v = _mm_loaddup_pd( sigma23 ); \
\
	/* Main loop: eight elements per trip, fully unrolled. */ \
	for ( i = 0; i < n_iter; ++i ) \
	{ \
		/* Element 1 of 8: rotate (a2,a3), then (a1,a2). */ \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
\
		/* Element 2 of 8. */ \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
\
		/* Element 3 of 8. */ \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
\
		/* Element 4 of 8. */ \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
\
		/* Element 5 of 8. */ \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
\
		/* Element 6 of 8. */ \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
\
		/* Element 7 of 8. */ \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
\
		/* Element 8 of 8. */ \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
	} \
\
	/* Remainder: the m_A % 8 leftover elements, one per trip. */ \
	for ( i = 0; i < n_left; ++i ) \
	{ \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
		a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
		a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
	} \
}

#endif
// end FLA_Apply_G_mx3b_asm.h

// begin FLA_Apply_G_mx4s_opt.h


// MAC_Apply_G_mx4s_ops: portable single-precision (float) kernel that applies
// four plane rotations to four columns, row by row.
//
// For each of the m_A rows the macro performs, in this order:
//   1. (a2, a3) rotated with gamma23_k1/sigma23_k1
//   2. (a3, a4) rotated with gamma34_k1/sigma34_k1
//   3. (a1, a2) rotated with gamma12_k2/sigma12_k2
//   4. (a2, a3) rotated with gamma23_k2/sigma23_k2
// where a rotation of (x, y) by (gamma, sigma) is
//   x' = gamma * x + sigma * y;   y' = gamma * y - sigma * x.
//
// Parameters:
//   m_A                 number of rows to process.
//   gamma*, sigma*      pointers to float rotation coefficients (read once).
//   a1..a4              float pointers to the first element of each column;
//                       declared restrict inside, so columns must not alias.
//   inc_a1..inc_a4      per-row pointer increments, in elements.
//
// All macro arguments are parenthesized at their point of use so callers may
// pass compound expressions (e.g. `g + 1`) without precedence surprises.
// The expansion is a brace block declaring locals named ga*/si*/alpha*/temp*
// and i; arguments must not use those identifiers.
#define MAC_Apply_G_mx4s_ops( m_A, \
                              gamma23_k1, \
                              sigma23_k1, \
                              gamma34_k1, \
                              sigma34_k1, \
                              gamma12_k2, \
                              sigma12_k2, \
                              gamma23_k2, \
                              sigma23_k2, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3, \
                              a4, inc_a4 ) \
{ \
	float              ga23_k1 = *( gamma23_k1 ); \
	float              si23_k1 = *( sigma23_k1 ); \
	float              ga34_k1 = *( gamma34_k1 ); \
	float              si34_k1 = *( sigma34_k1 ); \
	float              ga12_k2 = *( gamma12_k2 ); \
	float              si12_k2 = *( sigma12_k2 ); \
	float              ga23_k2 = *( gamma23_k2 ); \
	float              si23_k2 = *( sigma23_k2 ); \
	float*    restrict alpha1 = ( a1 ); \
	float*    restrict alpha2 = ( a2 ); \
	float*    restrict alpha3 = ( a3 ); \
	float*    restrict alpha4 = ( a4 ); \
	float              temp1; \
	float              temp2; \
	float              temp3; \
	float              temp4; \
	int                i; \
\
	for ( i = 0; i < ( m_A ); ++i ) \
	{ \
		/* Rotation 1: columns 2 and 3 with the (23, k1) coefficients. */ \
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		*alpha2 = temp2 * ga23_k1 + temp3 * si23_k1; \
		*alpha3 = temp3 * ga23_k1 - temp2 * si23_k1; \
\
		/* Rotation 2: columns 3 and 4 with the (34, k1) coefficients. */ \
		temp3 = *alpha3; \
		temp4 = *alpha4; \
\
		*alpha3 = temp3 * ga34_k1 + temp4 * si34_k1; \
		*alpha4 = temp4 * ga34_k1 - temp3 * si34_k1; \
\
		/* Rotation 3: columns 1 and 2 with the (12, k2) coefficients. */ \
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		*alpha1 = temp1 * ga12_k2 + temp2 * si12_k2; \
		*alpha2 = temp2 * ga12_k2 - temp1 * si12_k2; \
\
		/* Rotation 4: columns 2 and 3 with the (23, k2) coefficients. */ \
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		*alpha2 = temp2 * ga23_k2 + temp3 * si23_k2; \
		*alpha3 = temp3 * ga23_k2 - temp2 * si23_k2; \
\
		/* Advance every column to its next row. */ \
		alpha1 += ( inc_a1 ); \
		alpha2 += ( inc_a2 ); \
		alpha3 += ( inc_a3 ); \
		alpha4 += ( inc_a4 ); \
	} \
}

// MAC_Apply_G_mx4s_opc: portable single-precision complex (scomplex) kernel
// that applies four real-coefficient plane rotations to four columns.
//
// For each of the m_A rows the macro performs, in this order:
//   1. (a2, a3) rotated with gamma23_k1/sigma23_k1
//   2. (a3, a4) rotated with gamma34_k1/sigma34_k1
//   3. (a1, a2) rotated with gamma12_k2/sigma12_k2
//   4. (a2, a3) rotated with gamma23_k2/sigma23_k2
// where a rotation of (x, y) by (gamma, sigma) is
//   x' = gamma * x + sigma * y;   y' = gamma * y - sigma * x,
// applied independently to the .real and .imag members.
//
// Parameters:
//   m_A                 number of rows to process.
//   gamma*, sigma*      pointers to float rotation coefficients (read once).
//   a1..a4              scomplex pointers to the first element of each
//                       column; declared restrict inside, so no aliasing.
//   inc_a1..inc_a4      per-row pointer increments, in scomplex elements.
//
// All macro arguments are parenthesized at their point of use so callers may
// pass compound expressions (e.g. `g + 1`) without precedence surprises.
// The expansion is a brace block declaring locals named ga*/si*/alpha*/temp*
// and i; arguments must not use those identifiers.
#define MAC_Apply_G_mx4s_opc( m_A, \
                              gamma23_k1, \
                              sigma23_k1, \
                              gamma34_k1, \
                              sigma34_k1, \
                              gamma12_k2, \
                              sigma12_k2, \
                              gamma23_k2, \
                              sigma23_k2, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3, \
                              a4, inc_a4 ) \
{ \
	float              ga23_k1 = *( gamma23_k1 ); \
	float              si23_k1 = *( sigma23_k1 ); \
	float              ga34_k1 = *( gamma34_k1 ); \
	float              si34_k1 = *( sigma34_k1 ); \
	float              ga12_k2 = *( gamma12_k2 ); \
	float              si12_k2 = *( sigma12_k2 ); \
	float              ga23_k2 = *( gamma23_k2 ); \
	float              si23_k2 = *( sigma23_k2 ); \
	scomplex* restrict alpha1 = ( a1 ); \
	scomplex* restrict alpha2 = ( a2 ); \
	scomplex* restrict alpha3 = ( a3 ); \
	scomplex* restrict alpha4 = ( a4 ); \
	scomplex           temp1; \
	scomplex           temp2; \
	scomplex           temp3; \
	scomplex           temp4; \
	int                i; \
\
	for ( i = 0; i < ( m_A ); ++i ) \
	{ \
		/* Rotation 1: columns 2 and 3 with the (23, k1) coefficients. */ \
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		alpha2->real = temp2.real * ga23_k1 + temp3.real * si23_k1; \
		alpha2->imag = temp2.imag * ga23_k1 + temp3.imag * si23_k1; \
\
		alpha3->real = temp3.real * ga23_k1 - temp2.real * si23_k1; \
		alpha3->imag = temp3.imag * ga23_k1 - temp2.imag * si23_k1; \
\
		/* Rotation 2: columns 3 and 4 with the (34, k1) coefficients. */ \
		temp3 = *alpha3; \
		temp4 = *alpha4; \
\
		alpha3->real = temp3.real * ga34_k1 + temp4.real * si34_k1; \
		alpha3->imag = temp3.imag * ga34_k1 + temp4.imag * si34_k1; \
\
		alpha4->real = temp4.real * ga34_k1 - temp3.real * si34_k1; \
		alpha4->imag = temp4.imag * ga34_k1 - temp3.imag * si34_k1; \
\
		/* Rotation 3: columns 1 and 2 with the (12, k2) coefficients. */ \
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		alpha1->real = temp1.real * ga12_k2 + temp2.real * si12_k2; \
		alpha1->imag = temp1.imag * ga12_k2 + temp2.imag * si12_k2; \
\
		alpha2->real = temp2.real * ga12_k2 - temp1.real * si12_k2; \
		alpha2->imag = temp2.imag * ga12_k2 - temp1.imag * si12_k2; \
\
		/* Rotation 4: columns 2 and 3 with the (23, k2) coefficients. */ \
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		alpha2->real = temp2.real * ga23_k2 + temp3.real * si23_k2; \
		alpha2->imag = temp2.imag * ga23_k2 + temp3.imag * si23_k2; \
\
		alpha3->real = temp3.real * ga23_k2 - temp2.real * si23_k2; \
		alpha3->imag = temp3.imag * ga23_k2 - temp2.imag * si23_k2; \
\
		/* Advance every column to its next row. */ \
		alpha1 += ( inc_a1 ); \
		alpha2 += ( inc_a2 ); \
		alpha3 += ( inc_a3 ); \
		alpha4 += ( inc_a4 ); \
	} \
}

// MAC_Apply_G_mx4s_opd: portable double-precision kernel that applies four
// plane rotations to four columns, row by row.
//
// For each of the m_A rows the macro performs, in this order:
//   1. (a2, a3) rotated with gamma23_k1/sigma23_k1
//   2. (a3, a4) rotated with gamma34_k1/sigma34_k1
//   3. (a1, a2) rotated with gamma12_k2/sigma12_k2
//   4. (a2, a3) rotated with gamma23_k2/sigma23_k2
// where a rotation of (x, y) by (gamma, sigma) is
//   x' = gamma * x + sigma * y;   y' = gamma * y - sigma * x.
//
// Parameters:
//   m_A                 number of rows to process.
//   gamma*, sigma*      pointers to double rotation coefficients (read once).
//   a1..a4              double pointers to the first element of each column;
//                       declared restrict inside, so columns must not alias.
//   inc_a1..inc_a4      per-row pointer increments, in elements.
//
// All macro arguments are parenthesized at their point of use so callers may
// pass compound expressions (e.g. `g + 1`) without precedence surprises.
// The expansion is a brace block declaring locals named ga*/si*/alpha*/temp*
// and i; arguments must not use those identifiers.
#define MAC_Apply_G_mx4s_opd( m_A, \
                              gamma23_k1, \
                              sigma23_k1, \
                              gamma34_k1, \
                              sigma34_k1, \
                              gamma12_k2, \
                              sigma12_k2, \
                              gamma23_k2, \
                              sigma23_k2, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3, \
                              a4, inc_a4 ) \
{ \
	double             ga23_k1 = *( gamma23_k1 ); \
	double             si23_k1 = *( sigma23_k1 ); \
	double             ga34_k1 = *( gamma34_k1 ); \
	double             si34_k1 = *( sigma34_k1 ); \
	double             ga12_k2 = *( gamma12_k2 ); \
	double             si12_k2 = *( sigma12_k2 ); \
	double             ga23_k2 = *( gamma23_k2 ); \
	double             si23_k2 = *( sigma23_k2 ); \
	double*   restrict alpha1 = ( a1 ); \
	double*   restrict alpha2 = ( a2 ); \
	double*   restrict alpha3 = ( a3 ); \
	double*   restrict alpha4 = ( a4 ); \
	double             temp1; \
	double             temp2; \
	double             temp3; \
	double             temp4; \
	int                i; \
\
	for ( i = 0; i < ( m_A ); ++i ) \
	{ \
		/* Rotation 1: columns 2 and 3 with the (23, k1) coefficients. */ \
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		*alpha2 = temp2 * ga23_k1 + temp3 * si23_k1; \
		*alpha3 = temp3 * ga23_k1 - temp2 * si23_k1; \
\
		/* Rotation 2: columns 3 and 4 with the (34, k1) coefficients. */ \
		temp3 = *alpha3; \
		temp4 = *alpha4; \
\
		*alpha3 = temp3 * ga34_k1 + temp4 * si34_k1; \
		*alpha4 = temp4 * ga34_k1 - temp3 * si34_k1; \
\
		/* Rotation 3: columns 1 and 2 with the (12, k2) coefficients. */ \
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		*alpha1 = temp1 * ga12_k2 + temp2 * si12_k2; \
		*alpha2 = temp2 * ga12_k2 - temp1 * si12_k2; \
\
		/* Rotation 4: columns 2 and 3 with the (23, k2) coefficients. */ \
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		*alpha2 = temp2 * ga23_k2 + temp3 * si23_k2; \
		*alpha3 = temp3 * ga23_k2 - temp2 * si23_k2; \
\
		/* Advance every column to its next row. */ \
		alpha1 += ( inc_a1 ); \
		alpha2 += ( inc_a2 ); \
		alpha3 += ( inc_a3 ); \
		alpha4 += ( inc_a4 ); \
	} \
}

// MAC_Apply_G_mx4s_opz: portable double-precision complex (dcomplex) kernel
// that applies four real-coefficient plane rotations to four columns.
//
// For each of the m_A rows the macro performs, in this order:
//   1. (a2, a3) rotated with gamma23_k1/sigma23_k1
//   2. (a3, a4) rotated with gamma34_k1/sigma34_k1
//   3. (a1, a2) rotated with gamma12_k2/sigma12_k2
//   4. (a2, a3) rotated with gamma23_k2/sigma23_k2
// where a rotation of (x, y) by (gamma, sigma) is
//   x' = gamma * x + sigma * y;   y' = gamma * y - sigma * x,
// applied independently to the .real and .imag members.
//
// Parameters:
//   m_A                 number of rows to process.
//   gamma*, sigma*      pointers to double rotation coefficients (read once).
//   a1..a4              dcomplex pointers to the first element of each
//                       column; declared restrict inside, so no aliasing.
//   inc_a1..inc_a4      per-row pointer increments, in dcomplex elements.
//
// All macro arguments are parenthesized at their point of use so callers may
// pass compound expressions (e.g. `g + 1`) without precedence surprises.
// The expansion is a brace block declaring locals named ga*/si*/alpha*/temp*
// and i; arguments must not use those identifiers.
#define MAC_Apply_G_mx4s_opz( m_A, \
                              gamma23_k1, \
                              sigma23_k1, \
                              gamma34_k1, \
                              sigma34_k1, \
                              gamma12_k2, \
                              sigma12_k2, \
                              gamma23_k2, \
                              sigma23_k2, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3, \
                              a4, inc_a4 ) \
{ \
	double             ga23_k1 = *( gamma23_k1 ); \
	double             si23_k1 = *( sigma23_k1 ); \
	double             ga34_k1 = *( gamma34_k1 ); \
	double             si34_k1 = *( sigma34_k1 ); \
	double             ga12_k2 = *( gamma12_k2 ); \
	double             si12_k2 = *( sigma12_k2 ); \
	double             ga23_k2 = *( gamma23_k2 ); \
	double             si23_k2 = *( sigma23_k2 ); \
	dcomplex* restrict alpha1 = ( a1 ); \
	dcomplex* restrict alpha2 = ( a2 ); \
	dcomplex* restrict alpha3 = ( a3 ); \
	dcomplex* restrict alpha4 = ( a4 ); \
	dcomplex           temp1; \
	dcomplex           temp2; \
	dcomplex           temp3; \
	dcomplex           temp4; \
	int                i; \
\
	for ( i = 0; i < ( m_A ); ++i ) \
	{ \
		/* Rotation 1: columns 2 and 3 with the (23, k1) coefficients. */ \
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		alpha2->real = temp2.real * ga23_k1 + temp3.real * si23_k1; \
		alpha2->imag = temp2.imag * ga23_k1 + temp3.imag * si23_k1; \
\
		alpha3->real = temp3.real * ga23_k1 - temp2.real * si23_k1; \
		alpha3->imag = temp3.imag * ga23_k1 - temp2.imag * si23_k1; \
\
		/* Rotation 2: columns 3 and 4 with the (34, k1) coefficients. */ \
		temp3 = *alpha3; \
		temp4 = *alpha4; \
\
		alpha3->real = temp3.real * ga34_k1 + temp4.real * si34_k1; \
		alpha3->imag = temp3.imag * ga34_k1 + temp4.imag * si34_k1; \
\
		alpha4->real = temp4.real * ga34_k1 - temp3.real * si34_k1; \
		alpha4->imag = temp4.imag * ga34_k1 - temp3.imag * si34_k1; \
\
		/* Rotation 3: columns 1 and 2 with the (12, k2) coefficients. */ \
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		alpha1->real = temp1.real * ga12_k2 + temp2.real * si12_k2; \
		alpha1->imag = temp1.imag * ga12_k2 + temp2.imag * si12_k2; \
\
		alpha2->real = temp2.real * ga12_k2 - temp1.real * si12_k2; \
		alpha2->imag = temp2.imag * ga12_k2 - temp1.imag * si12_k2; \
\
		/* Rotation 4: columns 2 and 3 with the (23, k2) coefficients. */ \
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		alpha2->real = temp2.real * ga23_k2 + temp3.real * si23_k2; \
		alpha2->imag = temp2.imag * ga23_k2 + temp3.imag * si23_k2; \
\
		alpha3->real = temp3.real * ga23_k2 - temp2.real * si23_k2; \
		alpha3->imag = temp3.imag * ga23_k2 - temp2.imag * si23_k2; \
\
		/* Advance every column to its next row. */ \
		alpha1 += ( inc_a1 ); \
		alpha2 += ( inc_a2 ); \
		alpha3 += ( inc_a3 ); \
		alpha4 += ( inc_a4 ); \
	} \
}

// end FLA_Apply_G_mx4s_opt.h
// begin FLA_Apply_G_mx4s_asm.h



#if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS

// This branch is taken when FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS:
// each "as*" kernel name is a plain alias for the corresponding portable
// "op*" macro (s = float, d = double, c = scomplex, z = dcomplex).
#define MAC_Apply_G_mx4s_ass MAC_Apply_G_mx4s_ops
#define MAC_Apply_G_mx4s_asd MAC_Apply_G_mx4s_opd
#define MAC_Apply_G_mx4s_asc MAC_Apply_G_mx4s_opc
#define MAC_Apply_G_mx4s_asz MAC_Apply_G_mx4s_opz

#elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS

// MAC_Apply_G_mx4s_ass: SSE single-precision kernel that applies four plane
// rotations to four float columns, four rows per vector.
//
// For every row the macro performs, in this order:
//   1. (a2, a3) rotated with gamma23_k1/sigma23_k1
//   2. (a3, a4) rotated with gamma34_k1/sigma34_k1
//   3. (a1, a2) rotated with gamma12_k2/sigma12_k2
//   4. (a2, a3) rotated with gamma23_k2/sigma23_k2
// where a rotation of (x, y) by (gamma, sigma) is
//   x' = gamma * x + sigma * y;   y' = gamma * y - sigma * x.
//
// Parameters:
//   m_A                 number of rows to process.
//   gamma*, sigma*      pointers to float coefficients, broadcast to all four
//                       lanes with _mm_load1_ps.
//   a1..a4              float pointers to the first element of each column.
//   inc_a1..inc_a4      column increments; each vector step advances a
//                       pointer by inc_aN * 4 elements.
//
// Structure:
//   - Main loop: 32 rows per trip as eight groups of four rows. Groups
//     alternate between the a*v and b*v register sets, and the a2/a3 vectors
//     of the next group are loaded (from alpha2 + step_a2 / alpha3 + step_a3)
//     before the current group's a2/a3 results are stored.
//   - Second loop: remaining whole groups of four rows, one group per trip.
//   - Final loop: up to three leftover rows handled with scalar arithmetic.
//
// The aligned _mm_load_ps/_mm_store_ps intrinsics are used, so every vector
// access is expected to be 16-byte aligned (see Intel's intrinsics docs).
// NOTE(review): the scalar remainder loop advances each pointer by 1, not by
// inc_aN, which matches the vector loops only for unit-stride columns --
// confirm that callers always pass increments of 1.
//
// All macro arguments are parenthesized where they appear in expressions so
// that callers may safely pass compound expressions such as `m - 1`.
#define MAC_Apply_G_mx4s_ass( m_A, \
                              gamma23_k1, \
                              sigma23_k1, \
                              gamma34_k1, \
                              sigma34_k1, \
                              gamma12_k2, \
                              sigma12_k2, \
                              gamma23_k2, \
                              sigma23_k2, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3, \
                              a4, inc_a4 ) \
{ \
	int                n_iter32 = ( m_A ) / ( 4 * 8 ); \
	int                n_left32 = ( m_A ) % ( 4 * 8 ); \
	int                n_iter4  = n_left32 / ( 4 * 1 ); \
	int                n_left   = n_left32 % ( 4 * 1 ); \
	int                i; \
\
	const int          step_a1 = ( inc_a1 ) * 4; \
	const int          step_a2 = ( inc_a2 ) * 4; \
	const int          step_a3 = ( inc_a3 ) * 4; \
	const int          step_a4 = ( inc_a4 ) * 4; \
\
	float*    restrict alpha1 = ( a1 ); \
	float*    restrict alpha2 = ( a2 ); \
	float*    restrict alpha3 = ( a3 ); \
	float*    restrict alpha4 = ( a4 ); \
\
	v4sf_t             a1v, a2v, a3v, a4v; \
	v4sf_t             b1v, b2v, b3v, b4v; \
	v4sf_t             g23_k1v, s23_k1v; \
	v4sf_t             g34_k1v, s34_k1v; \
	v4sf_t             g12_k2v, s12_k2v; \
	v4sf_t             g23_k2v, s23_k2v; \
	v4sf_t             t1v, t2v, t3v; \
\
	/* Broadcast each rotation coefficient into all four vector lanes. */ \
	g23_k1v.v = _mm_load1_ps( gamma23_k1 ); \
	s23_k1v.v = _mm_load1_ps( sigma23_k1 ); \
	g34_k1v.v = _mm_load1_ps( gamma34_k1 ); \
	s34_k1v.v = _mm_load1_ps( sigma34_k1 ); \
	g12_k2v.v = _mm_load1_ps( gamma12_k2 ); \
	s12_k2v.v = _mm_load1_ps( sigma12_k2 ); \
	g23_k2v.v = _mm_load1_ps( gamma23_k2 ); \
	s23_k2v.v = _mm_load1_ps( sigma23_k2 ); \
\
	/* Main loop: 32 rows per trip (eight 4-row groups, software pipelined). */ \
	for ( i = 0; i < n_iter32; ++i ) \
	{ \
		/* Group 1 of 8 (a registers); nothing was preloaded for it. */ \
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
		a4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
		a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
		t3v.v = a3v.v; \
		a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
		a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
		_mm_store_ps( ( float* )alpha4, a4v.v ); \
		alpha4 += step_a4; \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
		a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		/* Look ahead: next group's a2 (alpha2 not advanced yet). */ \
		b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
		a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
		/* Look ahead: next group's a3 (alpha3 not advanced yet). */ \
		b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
\
		/* Group 2 of 8 (b registers); b2v/b3v were preloaded above. */ \
		b4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
		t2v.v = b2v.v; \
		b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
		b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
		/* Deferred store of the previous group's a3 result. */ \
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		b1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
		t3v.v = b3v.v; \
		b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
		b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
		_mm_store_ps( ( float* )alpha4, b4v.v ); \
		alpha4 += step_a4; \
\
		t1v.v = b1v.v; \
		b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
		b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
		_mm_store_ps( ( float* )alpha1, b1v.v ); \
		alpha1 += step_a1; \
		a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
\
		t2v.v = b2v.v; \
		b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
		b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
		_mm_store_ps( ( float* )alpha2, b2v.v ); \
		alpha2 += step_a2; \
		a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
\
		/* Group 3 of 8 (a registers). */ \
		a4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
		a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
		_mm_store_ps( ( float* )alpha3, b3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
		t3v.v = a3v.v; \
		a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
		a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
		_mm_store_ps( ( float* )alpha4, a4v.v ); \
		alpha4 += step_a4; \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
		a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
		a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
		b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
\
		/* Group 4 of 8 (b registers). */ \
		b4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
		t2v.v = b2v.v; \
		b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
		b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		b1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
		t3v.v = b3v.v; \
		b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
		b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
		_mm_store_ps( ( float* )alpha4, b4v.v ); \
		alpha4 += step_a4; \
\
		t1v.v = b1v.v; \
		b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
		b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
		_mm_store_ps( ( float* )alpha1, b1v.v ); \
		alpha1 += step_a1; \
		/* alpha2 must be offset by its own stride, step_a2, as in */ \
		/* every other look-ahead load of column 2 in this loop.   */ \
		a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
\
		t2v.v = b2v.v; \
		b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
		b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
		_mm_store_ps( ( float* )alpha2, b2v.v ); \
		alpha2 += step_a2; \
		a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
\
		/* Group 5 of 8 (a registers). */ \
		a4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
		a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
		_mm_store_ps( ( float* )alpha3, b3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
		t3v.v = a3v.v; \
		a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
		a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
		_mm_store_ps( ( float* )alpha4, a4v.v ); \
		alpha4 += step_a4; \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
		a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
		a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
		b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
\
		/* Group 6 of 8 (b registers). */ \
		b4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
		t2v.v = b2v.v; \
		b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
		b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		b1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
		t3v.v = b3v.v; \
		b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
		b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
		_mm_store_ps( ( float* )alpha4, b4v.v ); \
		alpha4 += step_a4; \
\
		t1v.v = b1v.v; \
		b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
		b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
		_mm_store_ps( ( float* )alpha1, b1v.v ); \
		alpha1 += step_a1; \
		a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
\
		t2v.v = b2v.v; \
		b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
		b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
		_mm_store_ps( ( float* )alpha2, b2v.v ); \
		alpha2 += step_a2; \
		a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
\
		/* Group 7 of 8 (a registers). */ \
		a4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
		a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
		_mm_store_ps( ( float* )alpha3, b3v.v ); \
		alpha3 += step_a3; \
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
		t3v.v = a3v.v; \
		a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
		a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
		_mm_store_ps( ( float* )alpha4, a4v.v ); \
		alpha4 += step_a4; \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
		a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
		a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
		b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
\
		/* Group 8 of 8 (b registers); last group, so no look-ahead loads. */ \
		b4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
		t2v.v = b2v.v; \
		b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
		b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		b1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
		t3v.v = b3v.v; \
		b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
		b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
		_mm_store_ps( ( float* )alpha4, b4v.v ); \
		alpha4 += step_a4; \
\
		t1v.v = b1v.v; \
		b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
		b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
		_mm_store_ps( ( float* )alpha1, b1v.v ); \
		alpha1 += step_a1; \
\
		t2v.v = b2v.v; \
		b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
		b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
		_mm_store_ps( ( float* )alpha2, b2v.v ); \
		alpha2 += step_a2; \
\
		_mm_store_ps( ( float* )alpha3, b3v.v ); \
		alpha3 += step_a3; \
	} \
\
	/* Remaining whole 4-row groups, one group per trip, no pipelining. */ \
	for ( i = 0; i < n_iter4; ++i ) \
	{ \
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
		a4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
		a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
		t3v.v = a3v.v; \
		a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
		a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
		_mm_store_ps( ( float* )alpha4, a4v.v ); \
		alpha4 += step_a4; \
\
		t1v.v = a1v.v; \
		a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
		a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
\
		t2v.v = a2v.v; \
		a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
		a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
	} \
\
	/* Scalar clean-up for the last m_A % 4 rows. */ \
	for ( i = 0; i < n_left; ++i ) \
	{ \
		float              ga23_k1 = *( gamma23_k1 ); \
		float              si23_k1 = *( sigma23_k1 ); \
		float              ga34_k1 = *( gamma34_k1 ); \
		float              si34_k1 = *( sigma34_k1 ); \
		float              ga12_k2 = *( gamma12_k2 ); \
		float              si12_k2 = *( sigma12_k2 ); \
		float              ga23_k2 = *( gamma23_k2 ); \
		float              si23_k2 = *( sigma23_k2 ); \
		float              temp1; \
		float              temp2; \
		float              temp3; \
		float              temp4; \
\
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		*alpha2 = temp2 * ga23_k1 + temp3 * si23_k1; \
		*alpha3 = temp3 * ga23_k1 - temp2 * si23_k1; \
\
		temp3 = *alpha3; \
		temp4 = *alpha4; \
\
		*alpha3 = temp3 * ga34_k1 + temp4 * si34_k1; \
		*alpha4 = temp4 * ga34_k1 - temp3 * si34_k1; \
\
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		*alpha1 = temp1 * ga12_k2 + temp2 * si12_k2; \
		*alpha2 = temp2 * ga12_k2 - temp1 * si12_k2; \
\
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		*alpha2 = temp2 * ga23_k2 + temp3 * si23_k2; \
		*alpha3 = temp3 * ga23_k2 - temp2 * si23_k2; \
\
		/* Unit-stride advance (see NOTE(review) in the header comment). */ \
		alpha1 += 1; \
		alpha2 += 1; \
		alpha3 += 1; \
		alpha4 += 1; \
	} \
}

// Internal helper for MAC_Apply_G_mx4s_asd: one rotation-style update on a
// pair of vector registers, using tv as scratch for the old value of xv:
//   xv <- xv * gv + yv * sv
//   yv <- yv * gv - xv_old * sv
#define MAC_Apply_G_mx4s_asd_rot( tv, xv, yv, gv, sv ) \
{ \
	tv.v = xv.v; \
	xv.v = tv.v * gv.v + yv.v * sv.v; \
	yv.v = yv.v * gv.v - tv.v * sv.v; \
}

// Internal helper for MAC_Apply_G_mx4s_asd: one steady-state stage of the
// software-pipelined main loop. It may only be expanded inside that macro,
// because it uses its locals (alpha1..alpha4, step_a1..step_a4, t1v..t3v and
// the broadcast gamma/sigma registers).
//   x1v..x4v  registers holding the block processed by this stage (x2v and
//             x3v were loaded ahead by the previous stage)
//   y2v, y3v  the other register set: y3v still holds the previous block's
//             finished a3 values, which are stored here; both are then
//             refilled with the NEXT block of a2 and a3.
// Keeping the stage in one place guarantees that every look-ahead load of a2
// uses step_a2 and every look-ahead load of a3 uses step_a3.
#define MAC_Apply_G_mx4s_asd_stage( x1v, x2v, x3v, x4v, y2v, y3v ) \
{ \
	x4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
	MAC_Apply_G_mx4s_asd_rot( t2v, x2v, x3v, g23_k1v, s23_k1v ); \
\
	/* Store the previous block's a3 result, deferred until now. */ \
	_mm_store_pd( ( double* )alpha3, y3v.v ); \
	alpha3 += step_a3; \
	x1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
	MAC_Apply_G_mx4s_asd_rot( t3v, x3v, x4v, g34_k1v, s34_k1v ); \
\
	_mm_store_pd( ( double* )alpha4, x4v.v ); \
	alpha4 += step_a4; \
\
	MAC_Apply_G_mx4s_asd_rot( t1v, x1v, x2v, g12_k2v, s12_k2v ); \
\
	_mm_store_pd( ( double* )alpha1, x1v.v ); \
	alpha1 += step_a1; \
	/* alpha2 still points at this block, so + step_a2 is the next one. */ \
	y2v.v = _mm_load_pd( ( double* )( alpha2 + step_a2 ) ); \
\
	MAC_Apply_G_mx4s_asd_rot( t2v, x2v, x3v, g23_k2v, s23_k2v ); \
\
	_mm_store_pd( ( double* )alpha2, x2v.v ); \
	alpha2 += step_a2; \
	/* alpha3 was advanced to this block above; + step_a3 is the next one. */ \
	y3v.v = _mm_load_pd( ( double* )( alpha3 + step_a3 ) ); \
}

// MAC_Apply_G_mx4s_asd: SSE kernel for double-precision real data.
//
// For each of the m_A elements of the four arrays a1..a4 (updated in place),
// applies four rotation-style updates in this order:
//   1. (gamma23_k1, sigma23_k1) to (a2, a3)
//   2. (gamma34_k1, sigma34_k1) to (a3, a4)
//   3. (gamma12_k2, sigma12_k2) to (a1, a2)
//   4. (gamma23_k2, sigma23_k2) to (a2, a3)
// where applying (g, s) to (x, y) computes x' = x*g + y*s and y' = y*g - x*s.
// NOTE(review): the naming suggests Givens rotations (gamma = cosine,
// sigma = sine); the macro itself does not check that g*g + s*s == 1.
//
// Parameters:
//   m_A               number of elements to process in each array
//   gamma*, sigma*    pointers to double scalars (read once, broadcast)
//   a1..a4            double* arrays; bound to restrict-qualified locals, so
//                     they must not overlap
//   inc_a1..inc_a4    each 2-wide vector step advances the matching pointer
//                     by 2 * inc doubles
//
// Work is split into a main loop handling 16 elements per iteration (eight
// 2-wide vectors, software-pipelined across two register sets), a loop
// handling 2 elements per iteration, and one scalar leftover element.
// The vector paths use _mm_load_pd/_mm_store_pd, so every address they touch
// must be 16-byte aligned. The macro expands to a brace-enclosed block and
// needs v2df_t and the SSE2/SSE3 intrinsics to be in scope.
#define MAC_Apply_G_mx4s_asd( m_A, \
                              gamma23_k1, \
                              sigma23_k1, \
                              gamma34_k1, \
                              sigma34_k1, \
                              gamma12_k2, \
                              sigma12_k2, \
                              gamma23_k2, \
                              sigma23_k2, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3, \
                              a4, inc_a4 ) \
{\
	int                n_iter16 = ( m_A ) / ( 2 * 8 ); \
	int                n_left16 = ( m_A ) % ( 2 * 8 ); \
	int                n_iter2  = n_left16 / ( 2 * 1 ); \
	int                n_left   = n_left16 % ( 2 * 1 ); \
	int                i; \
\
	const int          step_a1 = ( inc_a1 ) * 2; \
	const int          step_a2 = ( inc_a2 ) * 2; \
	const int          step_a3 = ( inc_a3 ) * 2; \
	const int          step_a4 = ( inc_a4 ) * 2; \
\
	double*   restrict alpha1 = a1; \
	double*   restrict alpha2 = a2; \
	double*   restrict alpha3 = a3; \
	double*   restrict alpha4 = a4; \
\
	v2df_t             a1v, a2v, a3v, a4v; \
	v2df_t             b1v, b2v, b3v, b4v; \
	v2df_t             g23_k1v, s23_k1v; \
	v2df_t             g34_k1v, s34_k1v; \
	v2df_t             g12_k2v, s12_k2v; \
	v2df_t             g23_k2v, s23_k2v; \
	v2df_t             t1v, t2v, t3v; \
\
	/* Broadcast each scalar into both lanes of its register. */ \
	g23_k1v.v = _mm_loaddup_pd( gamma23_k1 ); \
	s23_k1v.v = _mm_loaddup_pd( sigma23_k1 ); \
	g34_k1v.v = _mm_loaddup_pd( gamma34_k1 ); \
	s34_k1v.v = _mm_loaddup_pd( sigma34_k1 ); \
	g12_k2v.v = _mm_loaddup_pd( gamma12_k2 ); \
	s12_k2v.v = _mm_loaddup_pd( sigma12_k2 ); \
	g23_k2v.v = _mm_loaddup_pd( gamma23_k2 ); \
	s23_k2v.v = _mm_loaddup_pd( sigma23_k2 ); \
\
	for ( i = 0; i < n_iter16; ++i ) \
	{ \
		/* Block 0 (prologue): nothing was loaded ahead yet. */ \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
		a4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
		MAC_Apply_G_mx4s_asd_rot( t2v, a2v, a3v, g23_k1v, s23_k1v ); \
\
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
		MAC_Apply_G_mx4s_asd_rot( t3v, a3v, a4v, g34_k1v, s34_k1v ); \
\
		_mm_store_pd( ( double* )alpha4, a4v.v ); \
		alpha4 += step_a4; \
\
		MAC_Apply_G_mx4s_asd_rot( t1v, a1v, a2v, g12_k2v, s12_k2v ); \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		b2v.v = _mm_load_pd( ( double* )( alpha2 + step_a2 ) ); \
\
		MAC_Apply_G_mx4s_asd_rot( t2v, a2v, a3v, g23_k2v, s23_k2v ); \
\
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
		/* a3 is stored by the next stage, so alpha3 has not moved yet. */ \
		b3v.v = _mm_load_pd( ( double* )( alpha3 + step_a3 ) ); \
\
		/* Blocks 1..6: steady state, alternating the two register sets. */ \
		MAC_Apply_G_mx4s_asd_stage( b1v, b2v, b3v, b4v, a2v, a3v ); \
		MAC_Apply_G_mx4s_asd_stage( a1v, a2v, a3v, a4v, b2v, b3v ); \
		MAC_Apply_G_mx4s_asd_stage( b1v, b2v, b3v, b4v, a2v, a3v ); \
		MAC_Apply_G_mx4s_asd_stage( a1v, a2v, a3v, a4v, b2v, b3v ); \
		MAC_Apply_G_mx4s_asd_stage( b1v, b2v, b3v, b4v, a2v, a3v ); \
		MAC_Apply_G_mx4s_asd_stage( a1v, a2v, a3v, a4v, b2v, b3v ); \
\
		/* Block 7 (epilogue): no further look-ahead; drain both a3 stores. */ \
		b4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
		MAC_Apply_G_mx4s_asd_rot( t2v, b2v, b3v, g23_k1v, s23_k1v ); \
\
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		b1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
		MAC_Apply_G_mx4s_asd_rot( t3v, b3v, b4v, g34_k1v, s34_k1v ); \
\
		_mm_store_pd( ( double* )alpha4, b4v.v ); \
		alpha4 += step_a4; \
\
		MAC_Apply_G_mx4s_asd_rot( t1v, b1v, b2v, g12_k2v, s12_k2v ); \
\
		_mm_store_pd( ( double* )alpha1, b1v.v ); \
		alpha1 += step_a1; \
\
		MAC_Apply_G_mx4s_asd_rot( t2v, b2v, b3v, g23_k2v, s23_k2v ); \
\
		_mm_store_pd( ( double* )alpha2, b2v.v ); \
		alpha2 += step_a2; \
\
		_mm_store_pd( ( double* )alpha3, b3v.v ); \
		alpha3 += step_a3; \
	} \
\
	/* Remaining pairs of elements: one 2-wide vector per iteration. */ \
	for ( i = 0; i < n_iter2; ++i ) \
	{ \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
		a4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
		MAC_Apply_G_mx4s_asd_rot( t2v, a2v, a3v, g23_k1v, s23_k1v ); \
\
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
		MAC_Apply_G_mx4s_asd_rot( t3v, a3v, a4v, g34_k1v, s34_k1v ); \
\
		_mm_store_pd( ( double* )alpha4, a4v.v ); \
		alpha4 += step_a4; \
\
		MAC_Apply_G_mx4s_asd_rot( t1v, a1v, a2v, g12_k2v, s12_k2v ); \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
\
		MAC_Apply_G_mx4s_asd_rot( t2v, a2v, a3v, g23_k2v, s23_k2v ); \
\
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
	} \
\
	/* Odd m_A: the last element is handled with scalar arithmetic. */ \
	if ( n_left == 1 ) \
	{ \
		double             ga23_k1 = *( gamma23_k1 ); \
		double             si23_k1 = *( sigma23_k1 ); \
		double             ga34_k1 = *( gamma34_k1 ); \
		double             si34_k1 = *( sigma34_k1 ); \
		double             ga12_k2 = *( gamma12_k2 ); \
		double             si12_k2 = *( sigma12_k2 ); \
		double             ga23_k2 = *( gamma23_k2 ); \
		double             si23_k2 = *( sigma23_k2 ); \
		double             temp1; \
		double             temp2; \
		double             temp3; \
		double             temp4; \
\
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		*alpha2 = temp2 * ga23_k1 + temp3 * si23_k1; \
		*alpha3 = temp3 * ga23_k1 - temp2 * si23_k1; \
\
		temp3 = *alpha3; \
		temp4 = *alpha4; \
\
		*alpha3 = temp3 * ga34_k1 + temp4 * si34_k1; \
		*alpha4 = temp4 * ga34_k1 - temp3 * si34_k1; \
\
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		*alpha1 = temp1 * ga12_k2 + temp2 * si12_k2; \
		*alpha2 = temp2 * ga12_k2 - temp1 * si12_k2; \
\
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		*alpha2 = temp2 * ga23_k2 + temp3 * si23_k2; \
		*alpha3 = temp3 * ga23_k2 - temp2 * si23_k2; \
\
	} \
}

// Internal helper for MAC_Apply_G_mx4s_asc: one rotation-style update on a
// pair of vector registers, using tv as scratch for the old value of xv:
//   xv <- xv * gv + yv * sv
//   yv <- yv * gv - xv_old * sv
#define MAC_Apply_G_mx4s_asc_rot( tv, xv, yv, gv, sv ) \
{ \
	tv.v = xv.v; \
	xv.v = tv.v * gv.v + yv.v * sv.v; \
	yv.v = yv.v * gv.v - tv.v * sv.v; \
}

// Internal helper for MAC_Apply_G_mx4s_asc: one steady-state stage of the
// software-pipelined main loop. It may only be expanded inside that macro,
// because it uses its locals (alpha1..alpha4, step_a1..step_a4, t1v..t3v and
// the broadcast gamma/sigma registers).
//   x1v..x4v  registers holding the block processed by this stage (x2v and
//             x3v were loaded ahead by the previous stage)
//   y2v, y3v  the other register set: y3v still holds the previous block's
//             finished a3 values, which are stored here; both are then
//             refilled with the NEXT block of a2 and a3.
// Keeping the stage in one place guarantees that every look-ahead load of a2
// uses step_a2 and every look-ahead load of a3 uses step_a3.
#define MAC_Apply_G_mx4s_asc_stage( x1v, x2v, x3v, x4v, y2v, y3v ) \
{ \
	x4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
	MAC_Apply_G_mx4s_asc_rot( t2v, x2v, x3v, g23_k1v, s23_k1v ); \
\
	/* Store the previous block's a3 result, deferred until now. */ \
	_mm_store_ps( ( float* )alpha3, y3v.v ); \
	alpha3 += step_a3; \
	x1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
	MAC_Apply_G_mx4s_asc_rot( t3v, x3v, x4v, g34_k1v, s34_k1v ); \
\
	_mm_store_ps( ( float* )alpha4, x4v.v ); \
	alpha4 += step_a4; \
\
	MAC_Apply_G_mx4s_asc_rot( t1v, x1v, x2v, g12_k2v, s12_k2v ); \
\
	_mm_store_ps( ( float* )alpha1, x1v.v ); \
	alpha1 += step_a1; \
	/* alpha2 still points at this block, so + step_a2 is the next one. */ \
	y2v.v = _mm_load_ps( ( float* )( alpha2 + step_a2 ) ); \
\
	MAC_Apply_G_mx4s_asc_rot( t2v, x2v, x3v, g23_k2v, s23_k2v ); \
\
	_mm_store_ps( ( float* )alpha2, x2v.v ); \
	alpha2 += step_a2; \
	/* alpha3 was advanced to this block above; + step_a3 is the next one. */ \
	y3v.v = _mm_load_ps( ( float* )( alpha3 + step_a3 ) ); \
}

// MAC_Apply_G_mx4s_asc: SSE kernel for single-precision complex data.
//
// For each of the m_A scomplex elements of the four arrays a1..a4 (updated in
// place), applies four rotation-style updates in this order:
//   1. (gamma23_k1, sigma23_k1) to (a2, a3)
//   2. (gamma34_k1, sigma34_k1) to (a3, a4)
//   3. (gamma12_k2, sigma12_k2) to (a1, a2)
//   4. (gamma23_k2, sigma23_k2) to (a2, a3)
// where applying (g, s) to (x, y) computes x' = x*g + y*s and y' = y*g - x*s.
// gamma and sigma are real floats, so the real and imaginary parts of each
// element are updated independently with the same coefficients.
// NOTE(review): the naming suggests Givens rotations (gamma = cosine,
// sigma = sine); the macro itself does not check that g*g + s*s == 1.
//
// Parameters:
//   m_A               number of scomplex elements to process in each array
//   gamma*, sigma*    pointers to float scalars (read once, broadcast)
//   a1..a4            scomplex* arrays; bound to restrict-qualified locals, so
//                     they must not overlap
//   inc_a1..inc_a4    each vector step (two scomplex) advances the matching
//                     pointer by 2 * inc elements
//
// Work is split into a main loop handling 16 elements per iteration (eight
// 4-float vectors, software-pipelined across two register sets), a loop
// handling 2 elements per iteration, and one scalar leftover element.
// The vector paths use _mm_load_ps/_mm_store_ps, so every address they touch
// must be 16-byte aligned. The macro expands to a brace-enclosed block and
// needs v4sf_t, scomplex and the SSE intrinsics to be in scope.
#define MAC_Apply_G_mx4s_asc( m_A, \
                              gamma23_k1, \
                              sigma23_k1, \
                              gamma34_k1, \
                              sigma34_k1, \
                              gamma12_k2, \
                              sigma12_k2, \
                              gamma23_k2, \
                              sigma23_k2, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3, \
                              a4, inc_a4 ) \
{\
	int                n_iter16 = ( m_A ) / ( 2 * 8 ); \
	int                n_left16 = ( m_A ) % ( 2 * 8 ); \
	int                n_iter2  = n_left16 / ( 2 * 1 ); \
	int                n_left   = n_left16 % ( 2 * 1 ); \
	int                i; \
\
	const int          step_a1 = ( inc_a1 ) * 2; \
	const int          step_a2 = ( inc_a2 ) * 2; \
	const int          step_a3 = ( inc_a3 ) * 2; \
	const int          step_a4 = ( inc_a4 ) * 2; \
\
	scomplex* restrict alpha1 = a1; \
	scomplex* restrict alpha2 = a2; \
	scomplex* restrict alpha3 = a3; \
	scomplex* restrict alpha4 = a4; \
\
	v4sf_t             a1v, a2v, a3v, a4v; \
	v4sf_t             b1v, b2v, b3v, b4v; \
	v4sf_t             g23_k1v, s23_k1v; \
	v4sf_t             g34_k1v, s34_k1v; \
	v4sf_t             g12_k2v, s12_k2v; \
	v4sf_t             g23_k2v, s23_k2v; \
	v4sf_t             t1v, t2v, t3v; \
\
	/* Broadcast each scalar into all four lanes of its register. */ \
	g23_k1v.v = _mm_load1_ps( gamma23_k1 ); \
	s23_k1v.v = _mm_load1_ps( sigma23_k1 ); \
	g34_k1v.v = _mm_load1_ps( gamma34_k1 ); \
	s34_k1v.v = _mm_load1_ps( sigma34_k1 ); \
	g12_k2v.v = _mm_load1_ps( gamma12_k2 ); \
	s12_k2v.v = _mm_load1_ps( sigma12_k2 ); \
	g23_k2v.v = _mm_load1_ps( gamma23_k2 ); \
	s23_k2v.v = _mm_load1_ps( sigma23_k2 ); \
\
	for ( i = 0; i < n_iter16; ++i ) \
	{ \
		/* Block 0 (prologue): nothing was loaded ahead yet. */ \
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
		a4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
		MAC_Apply_G_mx4s_asc_rot( t2v, a2v, a3v, g23_k1v, s23_k1v ); \
\
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
		MAC_Apply_G_mx4s_asc_rot( t3v, a3v, a4v, g34_k1v, s34_k1v ); \
\
		_mm_store_ps( ( float* )alpha4, a4v.v ); \
		alpha4 += step_a4; \
\
		MAC_Apply_G_mx4s_asc_rot( t1v, a1v, a2v, g12_k2v, s12_k2v ); \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		b2v.v = _mm_load_ps( ( float* )( alpha2 + step_a2 ) ); \
\
		MAC_Apply_G_mx4s_asc_rot( t2v, a2v, a3v, g23_k2v, s23_k2v ); \
\
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
		/* a3 is stored by the next stage, so alpha3 has not moved yet. */ \
		b3v.v = _mm_load_ps( ( float* )( alpha3 + step_a3 ) ); \
\
		/* Blocks 1..6: steady state, alternating the two register sets. */ \
		MAC_Apply_G_mx4s_asc_stage( b1v, b2v, b3v, b4v, a2v, a3v ); \
		MAC_Apply_G_mx4s_asc_stage( a1v, a2v, a3v, a4v, b2v, b3v ); \
		MAC_Apply_G_mx4s_asc_stage( b1v, b2v, b3v, b4v, a2v, a3v ); \
		MAC_Apply_G_mx4s_asc_stage( a1v, a2v, a3v, a4v, b2v, b3v ); \
		MAC_Apply_G_mx4s_asc_stage( b1v, b2v, b3v, b4v, a2v, a3v ); \
		MAC_Apply_G_mx4s_asc_stage( a1v, a2v, a3v, a4v, b2v, b3v ); \
\
		/* Block 7 (epilogue): no further look-ahead; drain both a3 stores. */ \
		b4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
		MAC_Apply_G_mx4s_asc_rot( t2v, b2v, b3v, g23_k1v, s23_k1v ); \
\
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		b1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
		MAC_Apply_G_mx4s_asc_rot( t3v, b3v, b4v, g34_k1v, s34_k1v ); \
\
		_mm_store_ps( ( float* )alpha4, b4v.v ); \
		alpha4 += step_a4; \
\
		MAC_Apply_G_mx4s_asc_rot( t1v, b1v, b2v, g12_k2v, s12_k2v ); \
\
		_mm_store_ps( ( float* )alpha1, b1v.v ); \
		alpha1 += step_a1; \
\
		MAC_Apply_G_mx4s_asc_rot( t2v, b2v, b3v, g23_k2v, s23_k2v ); \
\
		_mm_store_ps( ( float* )alpha2, b2v.v ); \
		alpha2 += step_a2; \
\
		_mm_store_ps( ( float* )alpha3, b3v.v ); \
		alpha3 += step_a3; \
	} \
\
	/* Remaining pairs of elements: one 4-float vector per iteration. */ \
	for ( i = 0; i < n_iter2; ++i ) \
	{ \
		a2v.v = _mm_load_ps( ( float* )alpha2 ); \
		a3v.v = _mm_load_ps( ( float* )alpha3 ); \
		a4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
		MAC_Apply_G_mx4s_asc_rot( t2v, a2v, a3v, g23_k1v, s23_k1v ); \
\
		a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
		MAC_Apply_G_mx4s_asc_rot( t3v, a3v, a4v, g34_k1v, s34_k1v ); \
\
		_mm_store_ps( ( float* )alpha4, a4v.v ); \
		alpha4 += step_a4; \
\
		MAC_Apply_G_mx4s_asc_rot( t1v, a1v, a2v, g12_k2v, s12_k2v ); \
\
		_mm_store_ps( ( float* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
\
		MAC_Apply_G_mx4s_asc_rot( t2v, a2v, a3v, g23_k2v, s23_k2v ); \
\
		_mm_store_ps( ( float* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
		_mm_store_ps( ( float* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
	} \
\
	/* Odd m_A: the last element is handled with scalar arithmetic, */ \
	/* real and imaginary parts separately. */ \
	if ( n_left == 1 ) \
	{ \
		float             ga23_k1 = *( gamma23_k1 ); \
		float             si23_k1 = *( sigma23_k1 ); \
		float             ga34_k1 = *( gamma34_k1 ); \
		float             si34_k1 = *( sigma34_k1 ); \
		float             ga12_k2 = *( gamma12_k2 ); \
		float             si12_k2 = *( sigma12_k2 ); \
		float             ga23_k2 = *( gamma23_k2 ); \
		float             si23_k2 = *( sigma23_k2 ); \
		scomplex          temp1; \
		scomplex          temp2; \
		scomplex          temp3; \
		scomplex          temp4; \
\
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		alpha2->real = temp2.real * ga23_k1 + temp3.real * si23_k1; \
		alpha3->real = temp3.real * ga23_k1 - temp2.real * si23_k1; \
\
		alpha2->imag = temp2.imag * ga23_k1 + temp3.imag * si23_k1; \
		alpha3->imag = temp3.imag * ga23_k1 - temp2.imag * si23_k1; \
\
		temp3 = *alpha3; \
		temp4 = *alpha4; \
\
		alpha3->real = temp3.real * ga34_k1 + temp4.real * si34_k1; \
		alpha4->real = temp4.real * ga34_k1 - temp3.real * si34_k1; \
\
		alpha3->imag = temp3.imag * ga34_k1 + temp4.imag * si34_k1; \
		alpha4->imag = temp4.imag * ga34_k1 - temp3.imag * si34_k1; \
\
		temp1 = *alpha1; \
		temp2 = *alpha2; \
\
		alpha1->real = temp1.real * ga12_k2 + temp2.real * si12_k2; \
		alpha2->real = temp2.real * ga12_k2 - temp1.real * si12_k2; \
\
		alpha1->imag = temp1.imag * ga12_k2 + temp2.imag * si12_k2; \
		alpha2->imag = temp2.imag * ga12_k2 - temp1.imag * si12_k2; \
\
		temp2 = *alpha2; \
		temp3 = *alpha3; \
\
		alpha2->real = temp2.real * ga23_k2 + temp3.real * si23_k2; \
		alpha3->real = temp3.real * ga23_k2 - temp2.real * si23_k2; \
\
		alpha2->imag = temp2.imag * ga23_k2 + temp3.imag * si23_k2; \
		alpha3->imag = temp3.imag * ga23_k2 - temp2.imag * si23_k2; \
\
	} \
}

// Internal helper for MAC_Apply_G_mx4s_asz: one rotation-style update on a
// pair of vector registers, using tv as scratch for the old value of xv:
//   xv <- xv * gv + yv * sv
//   yv <- yv * gv - xv_old * sv
#define MAC_Apply_G_mx4s_asz_rot( tv, xv, yv, gv, sv ) \
{ \
	tv.v = xv.v; \
	xv.v = tv.v * gv.v + yv.v * sv.v; \
	yv.v = yv.v * gv.v - tv.v * sv.v; \
}

// Internal helper for MAC_Apply_G_mx4s_asz: one steady-state stage of the
// software-pipelined main loop. It may only be expanded inside that macro,
// because it uses its locals (alpha1..alpha4, step_a1..step_a4, t1v..t3v and
// the broadcast gamma/sigma registers).
//   x1v..x4v  registers holding the element processed by this stage (x2v and
//             x3v were loaded ahead by the previous stage)
//   y2v, y3v  the other register set: y3v still holds the previous element's
//             finished a3 value, which is stored here; both are then
//             refilled with the NEXT element of a2 and a3.
// Keeping the stage in one place guarantees that every look-ahead load of a2
// uses step_a2 and every look-ahead load of a3 uses step_a3.
#define MAC_Apply_G_mx4s_asz_stage( x1v, x2v, x3v, x4v, y2v, y3v ) \
{ \
	x4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
	MAC_Apply_G_mx4s_asz_rot( t2v, x2v, x3v, g23_k1v, s23_k1v ); \
\
	/* Store the previous element's a3 result, deferred until now. */ \
	_mm_store_pd( ( double* )alpha3, y3v.v ); \
	alpha3 += step_a3; \
	x1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
	MAC_Apply_G_mx4s_asz_rot( t3v, x3v, x4v, g34_k1v, s34_k1v ); \
\
	_mm_store_pd( ( double* )alpha4, x4v.v ); \
	alpha4 += step_a4; \
\
	MAC_Apply_G_mx4s_asz_rot( t1v, x1v, x2v, g12_k2v, s12_k2v ); \
\
	_mm_store_pd( ( double* )alpha1, x1v.v ); \
	alpha1 += step_a1; \
	/* alpha2 still points at this element, so + step_a2 is the next one. */ \
	y2v.v = _mm_load_pd( ( double* )( alpha2 + step_a2 ) ); \
\
	MAC_Apply_G_mx4s_asz_rot( t2v, x2v, x3v, g23_k2v, s23_k2v ); \
\
	_mm_store_pd( ( double* )alpha2, x2v.v ); \
	alpha2 += step_a2; \
	/* alpha3 was advanced to this element above; + step_a3 is the next one. */ \
	y3v.v = _mm_load_pd( ( double* )( alpha3 + step_a3 ) ); \
}

// MAC_Apply_G_mx4s_asz: SSE kernel for double-precision complex data.
//
// For each of the m_A dcomplex elements of the four arrays a1..a4 (updated in
// place), applies four rotation-style updates in this order:
//   1. (gamma23_k1, sigma23_k1) to (a2, a3)
//   2. (gamma34_k1, sigma34_k1) to (a3, a4)
//   3. (gamma12_k2, sigma12_k2) to (a1, a2)
//   4. (gamma23_k2, sigma23_k2) to (a2, a3)
// where applying (g, s) to (x, y) computes x' = x*g + y*s and y' = y*g - x*s.
// gamma and sigma are real doubles broadcast to both lanes, so the real and
// imaginary parts of each element are updated with the same coefficients.
// NOTE(review): the naming suggests Givens rotations (gamma = cosine,
// sigma = sine); the macro itself does not check that g*g + s*s == 1.
//
// Parameters:
//   m_A               number of dcomplex elements to process in each array
//   gamma*, sigma*    pointers to double scalars (read once, broadcast)
//   a1..a4            dcomplex* arrays; bound to restrict-qualified locals, so
//                     they must not overlap
//   inc_a1..inc_a4    each vector step (one dcomplex) advances the matching
//                     pointer by inc elements
//
// Work is split into a main loop handling 8 elements per iteration (eight
// 2-double vectors, software-pipelined across two register sets) and a
// remainder loop handling one element per iteration.
// All loads and stores use _mm_load_pd/_mm_store_pd, so every address they
// touch must be 16-byte aligned. The macro expands to a brace-enclosed block
// and needs v2df_t, dcomplex and the SSE2/SSE3 intrinsics to be in scope.
#define MAC_Apply_G_mx4s_asz( m_A, \
                              gamma23_k1, \
                              sigma23_k1, \
                              gamma34_k1, \
                              sigma34_k1, \
                              gamma12_k2, \
                              sigma12_k2, \
                              gamma23_k2, \
                              sigma23_k2, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3, \
                              a4, inc_a4 ) \
{\
	int                n_iter = ( m_A ) / 8; \
	int                n_left = ( m_A ) % 8; \
	int                i; \
\
	const int          step_a1 = ( inc_a1 ) * 1; \
	const int          step_a2 = ( inc_a2 ) * 1; \
	const int          step_a3 = ( inc_a3 ) * 1; \
	const int          step_a4 = ( inc_a4 ) * 1; \
\
	dcomplex* restrict alpha1 = a1; \
	dcomplex* restrict alpha2 = a2; \
	dcomplex* restrict alpha3 = a3; \
	dcomplex* restrict alpha4 = a4; \
\
	v2df_t             a1v, a2v, a3v, a4v; \
	v2df_t             b1v, b2v, b3v, b4v; \
	v2df_t             g23_k1v, s23_k1v; \
	v2df_t             g34_k1v, s34_k1v; \
	v2df_t             g12_k2v, s12_k2v; \
	v2df_t             g23_k2v, s23_k2v; \
	v2df_t             t1v, t2v, t3v; \
\
	/* Broadcast each scalar into both lanes of its register. */ \
	g23_k1v.v = _mm_loaddup_pd( gamma23_k1 ); \
	s23_k1v.v = _mm_loaddup_pd( sigma23_k1 ); \
	g34_k1v.v = _mm_loaddup_pd( gamma34_k1 ); \
	s34_k1v.v = _mm_loaddup_pd( sigma34_k1 ); \
	g12_k2v.v = _mm_loaddup_pd( gamma12_k2 ); \
	s12_k2v.v = _mm_loaddup_pd( sigma12_k2 ); \
	g23_k2v.v = _mm_loaddup_pd( gamma23_k2 ); \
	s23_k2v.v = _mm_loaddup_pd( sigma23_k2 ); \
\
	for ( i = 0; i < n_iter; ++i ) \
	{ \
		/* Element 0 (prologue): nothing was loaded ahead yet. */ \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
		a4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
		MAC_Apply_G_mx4s_asz_rot( t2v, a2v, a3v, g23_k1v, s23_k1v ); \
\
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
		MAC_Apply_G_mx4s_asz_rot( t3v, a3v, a4v, g34_k1v, s34_k1v ); \
\
		_mm_store_pd( ( double* )alpha4, a4v.v ); \
		alpha4 += step_a4; \
\
		MAC_Apply_G_mx4s_asz_rot( t1v, a1v, a2v, g12_k2v, s12_k2v ); \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
		b2v.v = _mm_load_pd( ( double* )( alpha2 + step_a2 ) ); \
\
		MAC_Apply_G_mx4s_asz_rot( t2v, a2v, a3v, g23_k2v, s23_k2v ); \
\
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
		/* a3 is stored by the next stage, so alpha3 has not moved yet. */ \
		b3v.v = _mm_load_pd( ( double* )( alpha3 + step_a3 ) ); \
\
		/* Elements 1..6: steady state, alternating the two register sets. */ \
		MAC_Apply_G_mx4s_asz_stage( b1v, b2v, b3v, b4v, a2v, a3v ); \
		MAC_Apply_G_mx4s_asz_stage( a1v, a2v, a3v, a4v, b2v, b3v ); \
		MAC_Apply_G_mx4s_asz_stage( b1v, b2v, b3v, b4v, a2v, a3v ); \
		MAC_Apply_G_mx4s_asz_stage( a1v, a2v, a3v, a4v, b2v, b3v ); \
		MAC_Apply_G_mx4s_asz_stage( b1v, b2v, b3v, b4v, a2v, a3v ); \
		MAC_Apply_G_mx4s_asz_stage( a1v, a2v, a3v, a4v, b2v, b3v ); \
\
		/* Element 7 (epilogue): no further look-ahead; drain both a3 stores. */ \
		b4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
		MAC_Apply_G_mx4s_asz_rot( t2v, b2v, b3v, g23_k1v, s23_k1v ); \
\
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
		b1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
		MAC_Apply_G_mx4s_asz_rot( t3v, b3v, b4v, g34_k1v, s34_k1v ); \
\
		_mm_store_pd( ( double* )alpha4, b4v.v ); \
		alpha4 += step_a4; \
\
		MAC_Apply_G_mx4s_asz_rot( t1v, b1v, b2v, g12_k2v, s12_k2v ); \
\
		_mm_store_pd( ( double* )alpha1, b1v.v ); \
		alpha1 += step_a1; \
\
		MAC_Apply_G_mx4s_asz_rot( t2v, b2v, b3v, g23_k2v, s23_k2v ); \
\
		_mm_store_pd( ( double* )alpha2, b2v.v ); \
		alpha2 += step_a2; \
\
		_mm_store_pd( ( double* )alpha3, b3v.v ); \
		alpha3 += step_a3; \
	} \
\
	/* Remaining elements: one dcomplex (one vector) per iteration. */ \
	for ( i = 0; i < n_left; ++i ) \
	{ \
		a2v.v = _mm_load_pd( ( double* )alpha2 ); \
		a3v.v = _mm_load_pd( ( double* )alpha3 ); \
		a4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
		MAC_Apply_G_mx4s_asz_rot( t2v, a2v, a3v, g23_k1v, s23_k1v ); \
\
		a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
		MAC_Apply_G_mx4s_asz_rot( t3v, a3v, a4v, g34_k1v, s34_k1v ); \
\
		_mm_store_pd( ( double* )alpha4, a4v.v ); \
		alpha4 += step_a4; \
\
		MAC_Apply_G_mx4s_asz_rot( t1v, a1v, a2v, g12_k2v, s12_k2v ); \
\
		_mm_store_pd( ( double* )alpha1, a1v.v ); \
		alpha1 += step_a1; \
\
		MAC_Apply_G_mx4s_asz_rot( t2v, a2v, a3v, g23_k2v, s23_k2v ); \
\
		_mm_store_pd( ( double* )alpha2, a2v.v ); \
		alpha2 += step_a2; \
		_mm_store_pd( ( double* )alpha3, a3v.v ); \
		alpha3 += step_a3; \
	} \
}

#endif
// end FLA_Apply_G_mx4s_asm.h

// end FLA_Apply_G.h
// begin FLA_Apply_H2_UT.h


// begin FLA_Apply_H2_UT_l.h


// skipped #include "FLAME.h" 

// Variant-1 prototypes for the "_l" flavor of Apply_H2_UT.
// The unb/opt entry points take four FLA_Obj operands (tau, u2, a1t, A2).
// The ops/opd/opc/opz entry points take the same four operands as raw
// float/double/scomplex/dcomplex pointers, preceded by two int arguments and
// with extra int arguments following each of u2, a1t and A2.
// NOTE(review): "_l" presumably means left-side application (cf. the FLA_Side
// argument of FLA_Apply_H2_UT_internal); m_*/n_* presumably are dimensions and
// inc_*/rs_*/cs_* vector increments and row/column strides -- confirm against
// the implementation.
FLA_Error FLA_Apply_H2_UT_l_unb_var1( FLA_Obj tau, FLA_Obj u2, FLA_Obj a1t,
                                                               FLA_Obj A2 );

FLA_Error FLA_Apply_H2_UT_l_opt_var1( FLA_Obj tau, FLA_Obj u2, FLA_Obj a1t,
                                                               FLA_Obj A2 );

// Single-precision real buffers.
FLA_Error FLA_Apply_H2_UT_l_ops_var1( int m_u2_A2,
                                      int n_a1t,
                                      float* tau,
                                      float* u2, int inc_u2,
                                      float* a1t, int inc_a1t,
                                      float* A2, int rs_A2, int cs_A2 );

// Double-precision real buffers.
FLA_Error FLA_Apply_H2_UT_l_opd_var1( int m_u2_A2,
                                      int n_a1t,
                                      double* tau,
                                      double* u2, int inc_u2,
                                      double* a1t, int inc_a1t,
                                      double* A2, int rs_A2, int cs_A2 );

// scomplex buffers.
FLA_Error FLA_Apply_H2_UT_l_opc_var1( int m_u2_A2,
                                      int n_a1t,
                                      scomplex* tau,
                                      scomplex* u2, int inc_u2,
                                      scomplex* a1t, int inc_a1t,
                                      scomplex* A2, int rs_A2, int cs_A2 );

// dcomplex buffers.
FLA_Error FLA_Apply_H2_UT_l_opz_var1( int m_u2_A2,
                                      int n_a1t,
                                      dcomplex* tau,
                                      dcomplex* u2, int inc_u2,
                                      dcomplex* a1t, int inc_a1t,
                                      dcomplex* A2, int rs_A2, int cs_A2 );

// end FLA_Apply_H2_UT_l.h
// begin FLA_Apply_H2_UT_r.h


// skipped #include "FLAME.h" 

// Variant-1 prototypes for the "_r" flavor of Apply_H2_UT.
// The unb/opt entry points take four FLA_Obj operands (tau, u2h, a1, A2).
// The ops/opd/opc/opz entry points take the same four operands as raw
// float/double/scomplex/dcomplex pointers, preceded by two int arguments and
// with extra int arguments following each of u2h, a1 and A2.
// NOTE(review): "_r" presumably means right-side application (cf. the
// FLA_Side argument of FLA_Apply_H2_UT_internal); n_*/m_* presumably are
// dimensions and inc_*/rs_*/cs_* increments and row/column strides -- confirm
// against the implementation.
FLA_Error FLA_Apply_H2_UT_r_unb_var1( FLA_Obj tau, FLA_Obj u2h,
                                      FLA_Obj a1, FLA_Obj A2 );

FLA_Error FLA_Apply_H2_UT_r_opt_var1( FLA_Obj tau, FLA_Obj u2h,
                                      FLA_Obj a1, FLA_Obj A2 );

// Single-precision real buffers.
FLA_Error FLA_Apply_H2_UT_r_ops_var1( int n_u2h_A2,
                                      int m_a1,
                                      float* tau,
                                      float* u2h, int inc_u2h,
                                      float* a1, int inc_a1,
                                      float* A2, int rs_A2, int cs_A2 );

// Double-precision real buffers.
FLA_Error FLA_Apply_H2_UT_r_opd_var1( int n_u2h_A2,
                                      int m_a1,
                                      double* tau,
                                      double* u2h, int inc_u2h,
                                      double* a1, int inc_a1,
                                      double* A2, int rs_A2, int cs_A2 );

// scomplex buffers.
FLA_Error FLA_Apply_H2_UT_r_opc_var1( int n_u2h_A2,
                                      int m_a1,
                                      scomplex* tau,
                                      scomplex* u2h, int inc_u2h,
                                      scomplex* a1, int inc_a1,
                                      scomplex* A2, int rs_A2, int cs_A2 );

// dcomplex buffers.
FLA_Error FLA_Apply_H2_UT_r_opz_var1( int n_u2h_A2,
                                      int m_a1,
                                      dcomplex* tau,
                                      dcomplex* u2h, int inc_u2h,
                                      dcomplex* a1, int inc_a1,
                                      dcomplex* A2, int rs_A2, int cs_A2 );

// end FLA_Apply_H2_UT_r.h

// Object-based Apply_H2_UT entry point that receives the side as a FLA_Side
// argument in addition to the four FLA_Obj operands.
// NOTE(review): presumably dispatches to the _l/_r variants declared earlier
// in this header -- confirm in the implementation.
FLA_Error FLA_Apply_H2_UT_internal( FLA_Side side, FLA_Obj tau, FLA_Obj u2, FLA_Obj a1, FLA_Obj A2 );

// end FLA_Apply_H2_UT.h
// begin FLA_Apply_HUD_UT.h


// begin FLA_Apply_HUD_UT_l.h


// skipped #include "FLAME.h" 

// Variant-1 prototypes for the "_l" flavor of Apply_HUD_UT.
// The unb/opt entry points take seven FLA_Obj operands
// (tau, w12t, r12t, u1, C2, v1, D2).
// The ops/opd/opc/opz entry points take the same seven operands as raw
// float/double/scomplex/dcomplex pointers, preceded by three int arguments
// and with extra int arguments following every operand except tau.
// NOTE(review): m_*/n_* presumably are dimensions and inc_*/rs_*/cs_*
// increments and row/column strides -- confirm against the implementation.
FLA_Error FLA_Apply_HUD_UT_l_unb_var1( FLA_Obj tau, FLA_Obj w12t,
                                                    FLA_Obj r12t,
                                       FLA_Obj u1,  FLA_Obj C2,
                                       FLA_Obj v1,  FLA_Obj D2 );

FLA_Error FLA_Apply_HUD_UT_l_opt_var1( FLA_Obj tau, FLA_Obj w12t,
                                                    FLA_Obj r12t,
                                       FLA_Obj u1,  FLA_Obj C2,
                                       FLA_Obj v1,  FLA_Obj D2 );

// Single-precision real buffers.
FLA_Error FLA_Apply_HUD_UT_l_ops_var1( int m_u1_C2,
                                       int m_v1_D2,
                                       int n_r12t,
                                       float* tau,
                                       float* w12t, int inc_w12t,
                                       float* r12t, int inc_r12t,
                                       float* u1, int inc_u1,
                                       float* C2, int rs_C2, int cs_C2,
                                       float* v1, int inc_v1,
                                       float* D2, int rs_D2, int cs_D2 );

// Double-precision real buffers.
FLA_Error FLA_Apply_HUD_UT_l_opd_var1( int m_u1_C2,
                                       int m_v1_D2,
                                       int n_r12t,
                                       double* tau,
                                       double* w12t, int inc_w12t,
                                       double* r12t, int inc_r12t,
                                       double* u1, int inc_u1,
                                       double* C2, int rs_C2, int cs_C2,
                                       double* v1, int inc_v1,
                                       double* D2, int rs_D2, int cs_D2 );

// scomplex buffers.
FLA_Error FLA_Apply_HUD_UT_l_opc_var1( int m_u1_C2,
                                       int m_v1_D2,
                                       int n_r12t,
                                       scomplex* tau,
                                       scomplex* w12t, int inc_w12t,
                                       scomplex* r12t, int inc_r12t,
                                       scomplex* u1, int inc_u1,
                                       scomplex* C2, int rs_C2, int cs_C2,
                                       scomplex* v1, int inc_v1,
                                       scomplex* D2, int rs_D2, int cs_D2 );

// dcomplex buffers.
FLA_Error FLA_Apply_HUD_UT_l_opz_var1( int m_u1_C2,
                                       int m_v1_D2,
                                       int n_r12t,
                                       dcomplex* tau,
                                       dcomplex* w12t, int inc_w12t,
                                       dcomplex* r12t, int inc_r12t,
                                       dcomplex* u1, int inc_u1,
                                       dcomplex* C2, int rs_C2, int cs_C2,
                                       dcomplex* v1, int inc_v1,
                                       dcomplex* D2, int rs_D2, int cs_D2 );

// end FLA_Apply_HUD_UT_l.h
//#include "FLA_Apply_HUD_UT_r.h"

// Object-based Apply_HUD_UT entry point that receives the side as a FLA_Side
// argument in addition to the seven FLA_Obj operands.
// Only "_l" prototypes are declared in this header; the include of
// FLA_Apply_HUD_UT_r.h is commented out.
// NOTE(review): how a non-left side value is handled is not visible here --
// confirm in the implementation.
FLA_Error FLA_Apply_HUD_UT_internal( FLA_Side side,
                                     FLA_Obj tau, FLA_Obj w12t,
                                                  FLA_Obj r12t,
                                     FLA_Obj u1,  FLA_Obj C2,
                                     FLA_Obj v1,  FLA_Obj D2 );

// end FLA_Apply_HUD_UT.h
// begin FLA_Apply_Q_UT.h


// begin FLA_Apply_Q_UT_lnfc.h


// "blk" variants 1-3 for the lnfc case; the three prototypes differ only in
// the variant number and all take (A, T, W, B, cntl).
FLA_Error FLA_Apply_Q_UT_lnfc_blk_var1( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lnfc_blk_var2( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lnfc_blk_var3( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );

// end FLA_Apply_Q_UT_lnfc.h
// begin FLA_Apply_Q_UT_lnfr.h


// "blk" variants 1-3 for the lnfr case; the three prototypes differ only in
// the variant number and all take (A, T, W, B, cntl).
FLA_Error FLA_Apply_Q_UT_lnfr_blk_var1( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lnfr_blk_var2( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lnfr_blk_var3( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );

// end FLA_Apply_Q_UT_lnfr.h
// begin FLA_Apply_Q_UT_lnbc.h


// "blk" variants 1-3 for the lnbc case; the three prototypes differ only in
// the variant number and all take (A, T, W, B, cntl).
FLA_Error FLA_Apply_Q_UT_lnbc_blk_var1( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lnbc_blk_var2( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lnbc_blk_var3( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );

// end FLA_Apply_Q_UT_lnbc.h
// begin FLA_Apply_Q_UT_lnbr.h


// "blk" variants 1-3 for the lnbr case; the three prototypes differ only in
// the variant number and all take (A, T, W, B, cntl).
FLA_Error FLA_Apply_Q_UT_lnbr_blk_var1( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lnbr_blk_var2( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lnbr_blk_var3( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );

// end FLA_Apply_Q_UT_lnbr.h
// begin FLA_Apply_Q_UT_lhfc.h


// "blk" variants 1-3 for the lhfc case; the three prototypes differ only in
// the variant number and all take (A, T, W, B, cntl).
FLA_Error FLA_Apply_Q_UT_lhfc_blk_var1( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lhfc_blk_var2( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lhfc_blk_var3( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );

// end FLA_Apply_Q_UT_lhfc.h
// begin FLA_Apply_Q_UT_lhfr.h


// "blk" variants 1-3 for the lhfr case; the three prototypes differ only in
// the variant number and all take (A, T, W, B, cntl).
FLA_Error FLA_Apply_Q_UT_lhfr_blk_var1( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lhfr_blk_var2( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lhfr_blk_var3( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );

// end FLA_Apply_Q_UT_lhfr.h
// begin FLA_Apply_Q_UT_lhbc.h


// "blk" variants 1-3 for the lhbc case; the three prototypes differ only in
// the variant number and all take (A, T, W, B, cntl).
FLA_Error FLA_Apply_Q_UT_lhbc_blk_var1( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lhbc_blk_var2( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lhbc_blk_var3( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );

// end FLA_Apply_Q_UT_lhbc.h
// begin FLA_Apply_Q_UT_lhbr.h


// "blk" variants 1-3 for the lhbr case; the three prototypes differ only in
// the variant number and all take (A, T, W, B, cntl).
FLA_Error FLA_Apply_Q_UT_lhbr_blk_var1( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lhbr_blk_var2( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lhbr_blk_var3( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );

// end FLA_Apply_Q_UT_lhbr.h

// begin FLA_Apply_Q_UT_rhbc.h


// "blk" variants 1-3 for the rhbc case; the three prototypes differ only in
// the variant number and all take (A, T, W, B, cntl).
FLA_Error FLA_Apply_Q_UT_rhbc_blk_var1( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rhbc_blk_var2( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rhbc_blk_var3( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );

// end FLA_Apply_Q_UT_rhbc.h
// begin FLA_Apply_Q_UT_rhbr.h


// "blk" variants 1-3 for the rhbr case; the three prototypes differ only in
// the variant number and all take (A, T, W, B, cntl).
FLA_Error FLA_Apply_Q_UT_rhbr_blk_var1( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rhbr_blk_var2( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rhbr_blk_var3( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );

// end FLA_Apply_Q_UT_rhbr.h
// begin FLA_Apply_Q_UT_rhfc.h


// "blk" variants 1-3 for the rhfc case; the three prototypes differ only in
// the variant number and all take (A, T, W, B, cntl).
FLA_Error FLA_Apply_Q_UT_rhfc_blk_var1( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rhfc_blk_var2( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rhfc_blk_var3( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );

// end FLA_Apply_Q_UT_rhfc.h
// begin FLA_Apply_Q_UT_rhfr.h


// "blk" variants 1-3 for the rhfr case; the three prototypes differ only in
// the variant number and all take (A, T, W, B, cntl).
FLA_Error FLA_Apply_Q_UT_rhfr_blk_var1( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rhfr_blk_var2( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rhfr_blk_var3( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );

// end FLA_Apply_Q_UT_rhfr.h
// begin FLA_Apply_Q_UT_rnbc.h


// "blk" variants 1-3 for the rnbc case; the three prototypes differ only in
// the variant number and all take (A, T, W, B, cntl).
FLA_Error FLA_Apply_Q_UT_rnbc_blk_var1( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rnbc_blk_var2( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rnbc_blk_var3( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );

// end FLA_Apply_Q_UT_rnbc.h
// begin FLA_Apply_Q_UT_rnbr.h


// "blk" variants 1-3 for the rnbr case; the three prototypes differ only in
// the variant number and all take (A, T, W, B, cntl).
FLA_Error FLA_Apply_Q_UT_rnbr_blk_var1( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rnbr_blk_var2( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rnbr_blk_var3( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );

// end FLA_Apply_Q_UT_rnbr.h
// begin FLA_Apply_Q_UT_rnfc.h


// "blk" variants 1-3 for the rnfc case; the three prototypes differ only in
// the variant number and all take (A, T, W, B, cntl).
FLA_Error FLA_Apply_Q_UT_rnfc_blk_var1( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rnfc_blk_var2( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rnfc_blk_var3( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );

// end FLA_Apply_Q_UT_rnfc.h
// begin FLA_Apply_Q_UT_rnfr.h


// "blk" variants 1-3 for the rnfr case; the three prototypes differ only in
// the variant number and all take (A, T, W, B, cntl).
FLA_Error FLA_Apply_Q_UT_rnfr_blk_var1( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rnfr_blk_var2( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rnfr_blk_var3( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );

// end FLA_Apply_Q_UT_rnfr.h

// Apply_Q_UT entry point that takes the four selectors (side, trans, direct,
// storev) as arguments together with the FLA_Obj operands and an
// fla_apqut_t* cntl argument.
// NOTE(review): presumably selects one of the sixteen per-case
// FLA_Apply_Q_UT_<case> routines declared in this header -- confirm in the
// implementation.
FLA_Error FLA_Apply_Q_UT_internal( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );

// Per-case entry points: sixteen prototypes, one per four-letter suffix
// ([l|r][n|h][f|b][c|r]), all taking (A, T, W, B, cntl).
// NOTE(review): the suffix letters presumably encode side, trans, direct and
// storev in that order, mirroring the selector arguments of
// FLA_Apply_Q_UT_internal -- confirm in the implementation.
FLA_Error FLA_Apply_Q_UT_lnfc( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lnfr( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lnbc( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lnbr( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lhfc( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lhfr( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lhbc( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_lhbr( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );

FLA_Error FLA_Apply_Q_UT_rhbc( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rhbr( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rhfc( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rhfr( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rnbc( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rnbr( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rnfc( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );
FLA_Error FLA_Apply_Q_UT_rnfr( FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl );

// Workspace helpers for Apply_Q_UT. Both take T and B by value and W by
// pointer; the _side form additionally takes a FLA_Side.
// NOTE(review): W is presumably an output object created by the call, which
// the caller must later release -- confirm in the implementation.
FLA_Error FLA_Apply_Q_UT_create_workspace( FLA_Obj T, FLA_Obj B, FLA_Obj* W );
FLA_Error FLA_Apply_Q_UT_create_workspace_side( FLA_Side side, FLA_Obj T, FLA_Obj B, FLA_Obj* W );

// FLASH_-prefixed counterparts: an apply routine taking the four selectors
// plus (A, T, W, B) with no cntl argument, and a workspace helper taking
// (TW, B) and W by pointer.
FLA_Error FLASH_Apply_Q_UT( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B );
FLA_Error FLASH_Apply_Q_UT_create_workspace( FLA_Obj TW, FLA_Obj B, FLA_Obj* W );

// end FLA_Apply_Q_UT.h
// begin FLA_Apply_Q2_UT.h


// begin FLA_Apply_Q2_UT_lhfc.h


// "blk" variants 1-3 for the lhfc case of Apply_Q2_UT. All three have the
// same parameter types (five FLA_Obj operands plus fla_apq2ut_t* cntl).
// var1/var2 name the third parameter W1 while var3 names it W; parameter
// names in a prototype do not affect the declared type.
FLA_Error FLA_Apply_Q2_UT_lhfc_blk_var1( FLA_Obj D, FLA_Obj T, FLA_Obj W1, FLA_Obj C, 
                                                                           FLA_Obj E, fla_apq2ut_t* cntl );
FLA_Error FLA_Apply_Q2_UT_lhfc_blk_var2( FLA_Obj D, FLA_Obj T, FLA_Obj W1, FLA_Obj C, 
                                                                           FLA_Obj E, fla_apq2ut_t* cntl );
FLA_Error FLA_Apply_Q2_UT_lhfc_blk_var3( FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C, 
                                                                          FLA_Obj E, fla_apq2ut_t* cntl );

// end FLA_Apply_Q2_UT_lhfc.h
// begin FLA_Apply_Q2_UT_lnfc.h


// "blk" variants 1-3 for the lnfc case of Apply_Q2_UT. All three have the
// same parameter types (five FLA_Obj operands plus fla_apq2ut_t* cntl).
// var1/var2 name the third parameter W1 while var3 names it W; parameter
// names in a prototype do not affect the declared type.
FLA_Error FLA_Apply_Q2_UT_lnfc_blk_var1( FLA_Obj D, FLA_Obj T, FLA_Obj W1, FLA_Obj C, 
                                                                           FLA_Obj E, fla_apq2ut_t* cntl );
FLA_Error FLA_Apply_Q2_UT_lnfc_blk_var2( FLA_Obj D, FLA_Obj T, FLA_Obj W1, FLA_Obj C, 
                                                                           FLA_Obj E, fla_apq2ut_t* cntl );
FLA_Error FLA_Apply_Q2_UT_lnfc_blk_var3( FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C, 
                                                                          FLA_Obj E, fla_apq2ut_t* cntl );

// end FLA_Apply_Q2_UT_lnfc.h

// FLASH_-prefixed Apply_Q2_UT entry point: four selectors plus the operands
// (D, T, W, C, E); takes no cntl argument.
FLA_Error FLASH_Apply_Q2_UT( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev,
                             FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C,
                                                              FLA_Obj E );

// Same selectors and operands, plus an fla_apq2ut_t* cntl argument.
// NOTE(review): presumably dispatches to the per-case routines below --
// confirm in the implementation.
FLA_Error FLA_Apply_Q2_UT_internal( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev,
                                    FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C,
                                                                     FLA_Obj E,
                                    fla_apq2ut_t* cntl );

// Per-case entry points; only the lhfc and lnfc cases are declared here.
FLA_Error FLA_Apply_Q2_UT_lhfc( FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C,
                                                                 FLA_Obj E,
                                fla_apq2ut_t* cntl );
FLA_Error FLA_Apply_Q2_UT_lnfc( FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C,
                                                                 FLA_Obj E,
                                fla_apq2ut_t* cntl );
// end FLA_Apply_Q2_UT.h
// begin FLA_Apply_CAQ2_UT.h


// begin FLA_Apply_CAQ2_UT_lhfc.h


// "blk" variants 1-3 for the lhfc case of Apply_CAQ2_UT. All three have the
// same parameter types (five FLA_Obj operands plus fla_apcaq2ut_t* cntl).
// var1/var2 name the third parameter W1 while var3 names it W; parameter
// names in a prototype do not affect the declared type.
FLA_Error FLA_Apply_CAQ2_UT_lhfc_blk_var1( FLA_Obj D, FLA_Obj T, FLA_Obj W1, FLA_Obj C, 
                                                                             FLA_Obj E, fla_apcaq2ut_t* cntl );
FLA_Error FLA_Apply_CAQ2_UT_lhfc_blk_var2( FLA_Obj D, FLA_Obj T, FLA_Obj W1, FLA_Obj C, 
                                                                             FLA_Obj E, fla_apcaq2ut_t* cntl );
FLA_Error FLA_Apply_CAQ2_UT_lhfc_blk_var3( FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C, 
                                                                            FLA_Obj E, fla_apcaq2ut_t* cntl );

// end FLA_Apply_CAQ2_UT_lhfc.h

// Apply_CAQ2_UT entry point taking the four selectors, the operands
// (D, T, W, C, E) and an fla_apcaq2ut_t* cntl argument.
// NOTE(review): presumably dispatches to the per-case routine below --
// confirm in the implementation.
FLA_Error FLA_Apply_CAQ2_UT_internal( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev,
                                      FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C,
                                                                       FLA_Obj E,
                                      fla_apcaq2ut_t* cntl );

// Per-case entry point; lhfc is the only case declared here.
FLA_Error FLA_Apply_CAQ2_UT_lhfc( FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C,
                                                                   FLA_Obj E,
                                  fla_apcaq2ut_t* cntl );
// end FLA_Apply_CAQ2_UT.h
// begin FLA_Apply_QUD_UT.h


// begin FLA_Apply_QUD_UT_lhfc.h


// "blk" variants 1-3 for the lhfc case of Apply_QUD_UT; the three prototypes
// differ only in the variant number and all take seven FLA_Obj operands
// (T, W, R, U, C, V, D) plus fla_apqudut_t* cntl.
FLA_Error FLA_Apply_QUD_UT_lhfc_blk_var1( FLA_Obj T, FLA_Obj W,
                                                     FLA_Obj R,
                                          FLA_Obj U, FLA_Obj C,
                                          FLA_Obj V, FLA_Obj D, fla_apqudut_t* cntl );

FLA_Error FLA_Apply_QUD_UT_lhfc_blk_var2( FLA_Obj T, FLA_Obj W,
                                                     FLA_Obj R,
                                          FLA_Obj U, FLA_Obj C,
                                          FLA_Obj V, FLA_Obj D, fla_apqudut_t* cntl );

FLA_Error FLA_Apply_QUD_UT_lhfc_blk_var3( FLA_Obj T, FLA_Obj W,
                                                     FLA_Obj R,
                                          FLA_Obj U, FLA_Obj C,
                                          FLA_Obj V, FLA_Obj D, fla_apqudut_t* cntl );

// end FLA_Apply_QUD_UT_lhfc.h

// Apply_QUD_UT entry point: four selectors plus seven FLA_Obj operands
// (T, W, R, U, C, V, D); takes no cntl argument.
FLA_Error FLA_Apply_QUD_UT( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev,
                            FLA_Obj T, FLA_Obj W,
                                       FLA_Obj R,
                            FLA_Obj U, FLA_Obj C,
                            FLA_Obj V, FLA_Obj D );

// Same selectors and operands, plus an fla_apqudut_t* cntl argument.
// NOTE(review): presumably dispatches to the per-case routine below --
// confirm in the implementation.
FLA_Error FLA_Apply_QUD_UT_internal( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev,
                                     FLA_Obj T, FLA_Obj W,
                                                FLA_Obj R,
                                     FLA_Obj U, FLA_Obj C,
                                     FLA_Obj V, FLA_Obj D, fla_apqudut_t* cntl );

// Per-case entry point; lhfc is the only case declared here.
FLA_Error FLA_Apply_QUD_UT_lhfc( FLA_Obj T, FLA_Obj W,
                                            FLA_Obj R,
                                 FLA_Obj U, FLA_Obj C,
                                 FLA_Obj V, FLA_Obj D, fla_apqudut_t* cntl );

// Workspace helper: takes T and R by value and W by pointer.
// NOTE(review): W is presumably an output object created by the call --
// confirm in the implementation.
FLA_Error FLA_Apply_QUD_UT_create_workspace( FLA_Obj T, FLA_Obj R, FLA_Obj* W );

// end FLA_Apply_QUD_UT.h
// begin FLA_Apply_Q_UT_inc.h


// begin FLA_Apply_Q_UT_inc_lhfc.h


// "blk" variant 1 for the lhfc case of Apply_Q_UT_inc (the only variant
// declared); takes (A, TW, W1, B) plus fla_apqutinc_t* cntl.
FLA_Error FLA_Apply_Q_UT_inc_lhfc_blk_var1( FLA_Obj A, FLA_Obj TW, FLA_Obj W1, FLA_Obj B, fla_apqutinc_t* cntl );

// end FLA_Apply_Q_UT_inc_lhfc.h
// begin FLA_Apply_Q_UT_inc_lnfc.h


// "blk" variant 1 for the lnfc case of Apply_Q_UT_inc (the only variant
// declared); takes (A, TW, W1, B) plus fla_apqutinc_t* cntl.
FLA_Error FLA_Apply_Q_UT_inc_lnfc_blk_var1( FLA_Obj A, FLA_Obj TW, FLA_Obj W1, FLA_Obj B, fla_apqutinc_t* cntl );

// end FLA_Apply_Q_UT_inc_lnfc.h

// FLASH_-prefixed Apply_Q_UT_inc entry point: four selectors plus
// (A, TW, W1, B); takes no cntl argument.
FLA_Error FLASH_Apply_Q_UT_inc( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev,
                                FLA_Obj A, FLA_Obj TW, FLA_Obj W1, FLA_Obj B );

// Workspace helper: takes TW and B by value and W by pointer.
// NOTE(review): W is presumably an output object created by the call --
// confirm in the implementation.
FLA_Error FLASH_Apply_Q_UT_inc_create_workspace( FLA_Obj TW, FLA_Obj B, FLA_Obj* W );

// Same selectors and operands as the FLASH_ entry point, plus an
// fla_apqutinc_t* cntl argument.
FLA_Error FLA_Apply_Q_UT_inc_internal( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev,
                                       FLA_Obj A, FLA_Obj TW, FLA_Obj W1, FLA_Obj B,
                                       fla_apqutinc_t* cntl );

// Per-case entry points; only the lhfc and lnfc cases are declared here.
FLA_Error FLA_Apply_Q_UT_inc_lhfc( FLA_Obj A, FLA_Obj TW, FLA_Obj W1, FLA_Obj B,
                                   fla_apqutinc_t* cntl );
FLA_Error FLA_Apply_Q_UT_inc_lnfc( FLA_Obj A, FLA_Obj TW, FLA_Obj W1, FLA_Obj B,
                                   fla_apqutinc_t* cntl );

// end FLA_Apply_Q_UT_inc.h
// begin FLA_Apply_CAQ_UT_inc.h


// begin FLA_Apply_CAQ_UT_inc_lhfc.h


// "blk" variant 1 for the lhfc case of Apply_CAQ_UT_inc (the only variant
// declared); takes (R, TW, W1, B) plus fla_apcaqutinc_t* cntl.
FLA_Error FLA_Apply_CAQ_UT_inc_lhfc_blk_var1( FLA_Obj R, FLA_Obj TW, FLA_Obj W1, FLA_Obj B, fla_apcaqutinc_t* cntl );

// end FLA_Apply_CAQ_UT_inc_lhfc.h

// FLASH_-prefixed Apply_CAQ_UT_inc entry point: a leading dim_t p, the four
// selectors, and six FLA_Obj operands (A, ATW, R, RTW, W, B).
// NOTE(review): the meaning of p is not visible from this header -- confirm
// in the implementation.
FLA_Error FLASH_Apply_CAQ_UT_inc( dim_t p,
                                  FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev,
                                  FLA_Obj A, FLA_Obj ATW,
                                  FLA_Obj R, FLA_Obj RTW, FLA_Obj W, FLA_Obj B );

// Helper taking a dim_t nb_part and four FLA_Obj operands (A, ATW, W, B).
FLA_Error FLA_Apply_CAQ_UT_inc_apply_panels( dim_t nb_part, FLA_Obj A, FLA_Obj ATW, FLA_Obj W, FLA_Obj B );

// Workspace helper: takes p, TW and B by value and W by pointer.
// NOTE(review): W is presumably an output object created by the call --
// confirm in the implementation.
FLA_Error FLASH_Apply_CAQ_UT_inc_create_workspace( dim_t p, FLA_Obj TW, FLA_Obj B, FLA_Obj* W );

// Entry point taking the four selectors, (R, TW, W1, B) and an
// fla_apcaqutinc_t* cntl argument.
FLA_Error FLA_Apply_CAQ_UT_inc_internal( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev,
                                         FLA_Obj R, FLA_Obj TW, FLA_Obj W1, FLA_Obj B,
                                         fla_apcaqutinc_t* cntl );

// Per-case entry point; lhfc is the only case declared here.
FLA_Error FLA_Apply_CAQ_UT_inc_lhfc( FLA_Obj R, FLA_Obj TW, FLA_Obj W1, FLA_Obj B,
                                     fla_apcaqutinc_t* cntl );

// end FLA_Apply_CAQ_UT_inc.h
// begin FLA_Apply_QUD_UT_inc.h


// begin FLA_Apply_QUD_UT_inc_lhfc.h


// "blk" variant 1 for the lhfc case of Apply_QUD_UT_inc (the only variant
// declared); takes seven FLA_Obj operands plus fla_apqudutinc_t* cntl.
// NOTE(review): the third parameter is named B here but R in the other
// Apply_QUD_UT_inc prototypes in this header; the types match, so this is a
// naming difference only -- confirm which name the definition uses.
FLA_Error FLA_Apply_QUD_UT_inc_lhfc_blk_var1( FLA_Obj T, FLA_Obj W,
                                                         FLA_Obj B,
                                              FLA_Obj U, FLA_Obj C,
                                              FLA_Obj V, FLA_Obj D, fla_apqudutinc_t* cntl );
// end FLA_Apply_QUD_UT_inc_lhfc.h

// FLASH_-prefixed Apply_QUD_UT_inc entry point: four selectors plus seven
// FLA_Obj operands (T, W, R, U, C, V, D); takes no cntl argument.
FLA_Error FLASH_Apply_QUD_UT_inc( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev,
                                  FLA_Obj T, FLA_Obj W,
                                             FLA_Obj R,
                                  FLA_Obj U, FLA_Obj C,
                                  FLA_Obj V, FLA_Obj D );

// Same selectors and operands, plus an fla_apqudutinc_t* cntl argument.
// NOTE(review): presumably dispatches to the per-case routine below --
// confirm in the implementation.
FLA_Error FLA_Apply_QUD_UT_inc_internal( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev,
                                         FLA_Obj T, FLA_Obj W,
                                                    FLA_Obj R,
                                         FLA_Obj U, FLA_Obj C,
                                         FLA_Obj V, FLA_Obj D, fla_apqudutinc_t* cntl );

// Per-case entry point; lhfc is the only case declared here.
FLA_Error FLA_Apply_QUD_UT_inc_lhfc( FLA_Obj T, FLA_Obj W,
                                                FLA_Obj R,
                                     FLA_Obj U, FLA_Obj C,
                                     FLA_Obj V, FLA_Obj D, fla_apqudutinc_t* cntl );

// Workspace helper: takes T and R by value and W by pointer.
// NOTE(review): W is presumably an output object created by the call --
// confirm in the implementation.
FLA_Error FLASH_Apply_QUD_UT_inc_create_workspace( FLA_Obj T, FLA_Obj R, FLA_Obj* W );

// end FLA_Apply_QUD_UT_inc.h
// begin FLA_Apply_pivots.h


// begin FLA_Apply_pivots_ln.h


// skipped #include "FLAME.h" 

// "blk" variants 1-2 of the "ln" pivot-application case: object-based, with
// an fla_appiv_t* cntl argument.
FLA_Error FLA_Apply_pivots_ln_blk_var1( FLA_Obj p, FLA_Obj A, fla_appiv_t* cntl );
FLA_Error FLA_Apply_pivots_ln_blk_var2( FLA_Obj p, FLA_Obj A, fla_appiv_t* cntl );

// "opt" variant 1: an object-based entry point followed by typed entry points
// for int (opi), float (ops), double (opd), scomplex (opc) and dcomplex (opz)
// buffers. The typed forms share one parameter list apart from the element
// type of a; the pivot buffer p is always int*.
// NOTE(review): a_rs/a_cs presumably are the row/column strides of a, k1/k2
// the bounds of the pivot range to apply and incp the increment of p (this
// resembles LAPACK's xLASWP interface) -- confirm in the implementation.
FLA_Error FLA_Apply_pivots_ln_opt_var1( FLA_Obj p, FLA_Obj A );
FLA_Error FLA_Apply_pivots_ln_opi_var1( int n, 
                                        int*      a, int a_rs, int a_cs, 
                                        int k1, 
                                        int k2, 
                                        int* p, int incp );
FLA_Error FLA_Apply_pivots_ln_ops_var1( int n, 
                                        float*    a, int a_rs, int a_cs, 
                                        int k1, 
                                        int k2, 
                                        int* p, int incp );
FLA_Error FLA_Apply_pivots_ln_opd_var1( int n, 
                                        double*   a, int a_rs, int a_cs, 
                                        int k1, 
                                        int k2, 
                                        int* p, int incp );
FLA_Error FLA_Apply_pivots_ln_opc_var1( int n, 
                                        scomplex* a, int a_rs, int a_cs, 
                                        int k1, 
                                        int k2, 
                                        int* p, int incp );
FLA_Error FLA_Apply_pivots_ln_opz_var1( int n, 
                                        dcomplex* a, int a_rs, int a_cs, 
                                        int k1, 
                                        int k2, 
                                        int* p, int incp );
// end FLA_Apply_pivots_ln.h
// begin FLA_Apply_pivots_lt.h


// skipped #include "FLAME.h" 

// "opt" variant 1 of the "lt" pivot-application case; object-based only (no
// typed-buffer variants are declared for this case).
FLA_Error FLA_Apply_pivots_lt_opt_var1( FLA_Obj p, FLA_Obj A );

// end FLA_Apply_pivots_lt.h
// begin FLA_Apply_pivots_rn.h


// skipped #include "FLAME.h" 

// "opt" variant 1 of the "rn" pivot-application case: an object-based entry
// point followed by typed entry points for float (ops), double (opd),
// scomplex (opc) and dcomplex (opz) buffers. Unlike the "ln" case, no int
// (opi) form is declared here. The typed forms share one parameter list apart
// from the element type of a; the pivot buffer p is always int*.
// NOTE(review): a_rs/a_cs presumably are the row/column strides of a, k1/k2
// the bounds of the pivot range to apply and incp the increment of p --
// confirm in the implementation.
FLA_Error FLA_Apply_pivots_rn_opt_var1( FLA_Obj p, FLA_Obj A );
FLA_Error FLA_Apply_pivots_rn_ops_var1( int n, 
                                        float*    a, int a_rs, int a_cs, 
                                        int k1, 
                                        int k2, 
                                        int* p, int incp );
FLA_Error FLA_Apply_pivots_rn_opd_var1( int n, 
                                        double*   a, int a_rs, int a_cs, 
                                        int k1, 
                                        int k2, 
                                        int* p, int incp );
FLA_Error FLA_Apply_pivots_rn_opc_var1( int n, 
                                        scomplex* a, int a_rs, int a_cs, 
                                        int k1, 
                                        int k2, 
                                        int* p, int incp );
FLA_Error FLA_Apply_pivots_rn_opz_var1( int n, 
                                        dcomplex* a, int a_rs, int a_cs, 
                                        int k1, 
                                        int k2, 
                                        int* p, int incp );
// end FLA_Apply_pivots_rn.h
// begin FLA_Apply_pivots_rt.h


// skipped #include "FLAME.h" 

// "opt" variant 1 of the "rt" pivot-application case; object-based only (no
// typed-buffer variants are declared for this case).
FLA_Error FLA_Apply_pivots_rt_opt_var1( FLA_Obj p, FLA_Obj A );

// end FLA_Apply_pivots_rt.h

// Pivot-application entry point taking side and trans selectors, the
// operands (p, A) and an fla_appiv_t* cntl argument.
// NOTE(review): presumably dispatches to one of the four per-case routines
// below -- confirm in the implementation.
FLA_Error FLA_Apply_pivots_internal( FLA_Side side, FLA_Trans trans, FLA_Obj p, FLA_Obj A, fla_appiv_t* cntl );

// Per-case entry points for the ln, lt, rn and rt cases.
// NOTE(review): the two suffix letters presumably encode side (l|r) and
// trans (n|t), mirroring the selectors above -- confirm.
FLA_Error FLA_Apply_pivots_ln( FLA_Obj p, FLA_Obj A, fla_appiv_t* cntl );
FLA_Error FLA_Apply_pivots_lt( FLA_Obj p, FLA_Obj A, fla_appiv_t* cntl );
FLA_Error FLA_Apply_pivots_rn( FLA_Obj p, FLA_Obj A, fla_appiv_t* cntl );
FLA_Error FLA_Apply_pivots_rt( FLA_Obj p, FLA_Obj A, fla_appiv_t* cntl );

// end FLA_Apply_pivots.h

// Eigensolvers
// begin FLA_Eig_gest.h


// begin FLA_Eig_gest_il.h


// skipped #include "FLAME.h" 

// Object-based algorithmic variants 1-5 of Eig_gest for the "il" case.
// All take the FLA_Obj operands A, Y and B and return an FLA_Error status.
// NOTE(review): "il" presumably means inverse / lower-triangular, matching the
// inv/uplo parameters of FLA_Eig_gest_internal -- confirm.
//
// blk variants (presumably "blocked") additionally take a fla_eig_gest_t
// control structure pointer.
FLA_Error FLA_Eig_gest_il_blk_var1( FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );
FLA_Error FLA_Eig_gest_il_blk_var2( FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );
FLA_Error FLA_Eig_gest_il_blk_var3( FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );
FLA_Error FLA_Eig_gest_il_blk_var4( FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );
FLA_Error FLA_Eig_gest_il_blk_var5( FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );

// unb variants (presumably "unblocked") take no control structure.
FLA_Error FLA_Eig_gest_il_unb_var1( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_il_unb_var2( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_il_unb_var3( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_il_unb_var4( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_il_unb_var5( FLA_Obj A, FLA_Obj Y, FLA_Obj B );

// Eig_gest "il" case, variant 1.
// The opt entry point takes FLA_Obj operands; the ops/opd/opc/opz entry points
// take raw buffers of float, double, scomplex and dcomplex respectively.
// NOTE(review): m_AB is presumably the common dimension of A and B, rs_*/cs_*
// the row/column strides of the matrix buffers, and inc_y the element stride
// of the work vector y -- inferred from names; confirm against the definition.
FLA_Error FLA_Eig_gest_il_opt_var1( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_il_ops_var1( int m_AB,
                                    float*    buff_A, int rs_A, int cs_A, 
                                    float*    buff_y, int inc_y, 
                                    float*    buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_il_opd_var1( int m_AB,
                                    double*   buff_A, int rs_A, int cs_A, 
                                    double*   buff_y, int inc_y, 
                                    double*   buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_il_opc_var1( int m_AB,
                                    scomplex* buff_A, int rs_A, int cs_A, 
                                    scomplex* buff_y, int inc_y, 
                                    scomplex* buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_il_opz_var1( int m_AB,
                                    dcomplex* buff_A, int rs_A, int cs_A, 
                                    dcomplex* buff_y, int inc_y, 
                                    dcomplex* buff_B, int rs_B, int cs_B );

// Eig_gest "il" case, variant 2.
// opt: FLA_Obj operands. ops/opd/opc/opz: raw float/double/scomplex/dcomplex
// buffers for A, y and B, with y described by a single increment inc_y.
// NOTE(review): dimension/stride meanings (m_AB, rs_*, cs_*, inc_y) are
// inferred from the parameter names -- confirm against the definition.
FLA_Error FLA_Eig_gest_il_opt_var2( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_il_ops_var2( int m_AB,
                                    float*    buff_A, int rs_A, int cs_A, 
                                    float*    buff_y, int inc_y, 
                                    float*    buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_il_opd_var2( int m_AB,
                                    double*   buff_A, int rs_A, int cs_A, 
                                    double*   buff_y, int inc_y, 
                                    double*   buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_il_opc_var2( int m_AB,
                                    scomplex* buff_A, int rs_A, int cs_A, 
                                    scomplex* buff_y, int inc_y, 
                                    scomplex* buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_il_opz_var2( int m_AB,
                                    dcomplex* buff_A, int rs_A, int cs_A, 
                                    dcomplex* buff_y, int inc_y, 
                                    dcomplex* buff_B, int rs_B, int cs_B );

// Eig_gest "il" case, variant 3.
// opt: FLA_Obj operands. ops/opd/opc/opz: raw float/double/scomplex/dcomplex
// buffers.
// Unlike the other typed il variants in this header, which take (buff_y,
// inc_y), the var3 routines describe Y with two strides (rs_Y, cs_Y), i.e. the
// same three-argument form used for A and B.
// NOTE(review): presumably Y is used as a matrix workspace here rather than a
// vector -- confirm against the definition.
FLA_Error FLA_Eig_gest_il_opt_var3( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_il_ops_var3( int m_AB,
                                    float*    buff_A, int rs_A, int cs_A, 
                                    float*    buff_Y, int rs_Y, int cs_Y,
                                    float*    buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_il_opd_var3( int m_AB,
                                    double*   buff_A, int rs_A, int cs_A, 
                                    double*   buff_Y, int rs_Y, int cs_Y,
                                    double*   buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_il_opc_var3( int m_AB,
                                    scomplex* buff_A, int rs_A, int cs_A, 
                                    scomplex* buff_Y, int rs_Y, int cs_Y,
                                    scomplex* buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_il_opz_var3( int m_AB,
                                    dcomplex* buff_A, int rs_A, int cs_A, 
                                    dcomplex* buff_Y, int rs_Y, int cs_Y,
                                    dcomplex* buff_B, int rs_B, int cs_B );

// Eig_gest "il" case, variant 4.
// opt: FLA_Obj operands. ops/opd/opc/opz: raw float/double/scomplex/dcomplex
// buffers for A, y and B, with y described by a single increment inc_y.
// NOTE(review): dimension/stride meanings (m_AB, rs_*, cs_*, inc_y) are
// inferred from the parameter names -- confirm against the definition.
FLA_Error FLA_Eig_gest_il_opt_var4( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_il_ops_var4( int m_AB,
                                    float*    buff_A, int rs_A, int cs_A, 
                                    float*    buff_y, int inc_y, 
                                    float*    buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_il_opd_var4( int m_AB,
                                    double*   buff_A, int rs_A, int cs_A, 
                                    double*   buff_y, int inc_y, 
                                    double*   buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_il_opc_var4( int m_AB,
                                    scomplex* buff_A, int rs_A, int cs_A, 
                                    scomplex* buff_y, int inc_y, 
                                    scomplex* buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_il_opz_var4( int m_AB,
                                    dcomplex* buff_A, int rs_A, int cs_A, 
                                    dcomplex* buff_y, int inc_y, 
                                    dcomplex* buff_B, int rs_B, int cs_B );

// Eig_gest "il" case, variant 5.
// opt: FLA_Obj operands. ops/opd/opc/opz: raw float/double/scomplex/dcomplex
// buffers for A, y and B, with y described by a single increment inc_y.
// NOTE(review): dimension/stride meanings (m_AB, rs_*, cs_*, inc_y) are
// inferred from the parameter names -- confirm against the definition.
FLA_Error FLA_Eig_gest_il_opt_var5( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_il_ops_var5( int m_AB,
                                    float*    buff_A, int rs_A, int cs_A, 
                                    float*    buff_y, int inc_y, 
                                    float*    buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_il_opd_var5( int m_AB,
                                    double*   buff_A, int rs_A, int cs_A, 
                                    double*   buff_y, int inc_y, 
                                    double*   buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_il_opc_var5( int m_AB,
                                    scomplex* buff_A, int rs_A, int cs_A, 
                                    scomplex* buff_y, int inc_y, 
                                    scomplex* buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_il_opz_var5( int m_AB,
                                    dcomplex* buff_A, int rs_A, int cs_A, 
                                    dcomplex* buff_y, int inc_y, 
                                    dcomplex* buff_B, int rs_B, int cs_B );
// end FLA_Eig_gest_il.h
// begin FLA_Eig_gest_iu.h


// skipped #include "FLAME.h" 

// Object-based algorithmic variants 1-5 of Eig_gest for the "iu" case.
// All take the FLA_Obj operands A, Y and B and return an FLA_Error status.
// NOTE(review): "iu" presumably means inverse / upper-triangular, matching the
// inv/uplo parameters of FLA_Eig_gest_internal -- confirm.
//
// blk variants (presumably "blocked") additionally take a fla_eig_gest_t
// control structure pointer.
FLA_Error FLA_Eig_gest_iu_blk_var1( FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );
FLA_Error FLA_Eig_gest_iu_blk_var2( FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );
FLA_Error FLA_Eig_gest_iu_blk_var3( FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );
FLA_Error FLA_Eig_gest_iu_blk_var4( FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );
FLA_Error FLA_Eig_gest_iu_blk_var5( FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );

// unb variants (presumably "unblocked") take no control structure.
FLA_Error FLA_Eig_gest_iu_unb_var1( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_iu_unb_var2( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_iu_unb_var3( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_iu_unb_var4( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_iu_unb_var5( FLA_Obj A, FLA_Obj Y, FLA_Obj B );

// Eig_gest "iu" case, variant 1.
// The opt entry point takes FLA_Obj operands; the ops/opd/opc/opz entry points
// take raw buffers of float, double, scomplex and dcomplex respectively.
// NOTE(review): m_AB is presumably the common dimension of A and B, rs_*/cs_*
// the row/column strides of the matrix buffers, and inc_y the element stride
// of the work vector y -- inferred from names; confirm against the definition.
FLA_Error FLA_Eig_gest_iu_opt_var1( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_iu_ops_var1( int m_AB,
                                    float*    buff_A, int rs_A, int cs_A, 
                                    float*    buff_y, int inc_y, 
                                    float*    buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_iu_opd_var1( int m_AB,
                                    double*   buff_A, int rs_A, int cs_A, 
                                    double*   buff_y, int inc_y, 
                                    double*   buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_iu_opc_var1( int m_AB,
                                    scomplex* buff_A, int rs_A, int cs_A, 
                                    scomplex* buff_y, int inc_y, 
                                    scomplex* buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_iu_opz_var1( int m_AB,
                                    dcomplex* buff_A, int rs_A, int cs_A, 
                                    dcomplex* buff_y, int inc_y, 
                                    dcomplex* buff_B, int rs_B, int cs_B );

// Eig_gest "iu" case, variant 2.
// opt: FLA_Obj operands. ops/opd/opc/opz: raw float/double/scomplex/dcomplex
// buffers for A, y and B, with y described by a single increment inc_y.
// NOTE(review): dimension/stride meanings (m_AB, rs_*, cs_*, inc_y) are
// inferred from the parameter names -- confirm against the definition.
FLA_Error FLA_Eig_gest_iu_opt_var2( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_iu_ops_var2( int m_AB,
                                    float*    buff_A, int rs_A, int cs_A, 
                                    float*    buff_y, int inc_y, 
                                    float*    buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_iu_opd_var2( int m_AB,
                                    double*   buff_A, int rs_A, int cs_A, 
                                    double*   buff_y, int inc_y, 
                                    double*   buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_iu_opc_var2( int m_AB,
                                    scomplex* buff_A, int rs_A, int cs_A, 
                                    scomplex* buff_y, int inc_y, 
                                    scomplex* buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_iu_opz_var2( int m_AB,
                                    dcomplex* buff_A, int rs_A, int cs_A, 
                                    dcomplex* buff_y, int inc_y, 
                                    dcomplex* buff_B, int rs_B, int cs_B );

// Eig_gest "iu" case, variant 3.
// opt: FLA_Obj operands. ops/opd/opc/opz: raw float/double/scomplex/dcomplex
// buffers.
// Unlike the other typed iu variants in this header, which take (buff_y,
// inc_y), the var3 routines describe Y with two strides (rs_Y, cs_Y), i.e. the
// same three-argument form used for A and B.
// NOTE(review): presumably Y is used as a matrix workspace here rather than a
// vector -- confirm against the definition.
FLA_Error FLA_Eig_gest_iu_opt_var3( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_iu_ops_var3( int m_AB,
                                    float*    buff_A, int rs_A, int cs_A, 
                                    float*    buff_Y, int rs_Y, int cs_Y,
                                    float*    buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_iu_opd_var3( int m_AB,
                                    double*   buff_A, int rs_A, int cs_A, 
                                    double*   buff_Y, int rs_Y, int cs_Y,
                                    double*   buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_iu_opc_var3( int m_AB,
                                    scomplex* buff_A, int rs_A, int cs_A, 
                                    scomplex* buff_Y, int rs_Y, int cs_Y,
                                    scomplex* buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_iu_opz_var3( int m_AB,
                                    dcomplex* buff_A, int rs_A, int cs_A, 
                                    dcomplex* buff_Y, int rs_Y, int cs_Y,
                                    dcomplex* buff_B, int rs_B, int cs_B );

// Eig_gest "iu" case, variant 4.
// opt: FLA_Obj operands. ops/opd/opc/opz: raw float/double/scomplex/dcomplex
// buffers for A, y and B, with y described by a single increment inc_y.
// NOTE(review): dimension/stride meanings (m_AB, rs_*, cs_*, inc_y) are
// inferred from the parameter names -- confirm against the definition.
FLA_Error FLA_Eig_gest_iu_opt_var4( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_iu_ops_var4( int m_AB,
                                    float*    buff_A, int rs_A, int cs_A, 
                                    float*    buff_y, int inc_y, 
                                    float*    buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_iu_opd_var4( int m_AB,
                                    double*   buff_A, int rs_A, int cs_A, 
                                    double*   buff_y, int inc_y, 
                                    double*   buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_iu_opc_var4( int m_AB,
                                    scomplex* buff_A, int rs_A, int cs_A, 
                                    scomplex* buff_y, int inc_y, 
                                    scomplex* buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_iu_opz_var4( int m_AB,
                                    dcomplex* buff_A, int rs_A, int cs_A, 
                                    dcomplex* buff_y, int inc_y, 
                                    dcomplex* buff_B, int rs_B, int cs_B );

// Eig_gest "iu" case, variant 5.
// opt: FLA_Obj operands. ops/opd/opc/opz: raw float/double/scomplex/dcomplex
// buffers for A, y and B, with y described by a single increment inc_y.
// NOTE(review): dimension/stride meanings (m_AB, rs_*, cs_*, inc_y) are
// inferred from the parameter names -- confirm against the definition.
FLA_Error FLA_Eig_gest_iu_opt_var5( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_iu_ops_var5( int m_AB,
                                    float*    buff_A, int rs_A, int cs_A, 
                                    float*    buff_y, int inc_y, 
                                    float*    buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_iu_opd_var5( int m_AB,
                                    double*   buff_A, int rs_A, int cs_A, 
                                    double*   buff_y, int inc_y, 
                                    double*   buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_iu_opc_var5( int m_AB,
                                    scomplex* buff_A, int rs_A, int cs_A, 
                                    scomplex* buff_y, int inc_y, 
                                    scomplex* buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_iu_opz_var5( int m_AB,
                                    dcomplex* buff_A, int rs_A, int cs_A, 
                                    dcomplex* buff_y, int inc_y, 
                                    dcomplex* buff_B, int rs_B, int cs_B );
// end FLA_Eig_gest_iu.h
// begin FLA_Eig_gest_nl.h


// skipped #include "FLAME.h" 

// Object-based algorithmic variants 1-5 of Eig_gest for the "nl" case.
// All take the FLA_Obj operands A, Y and B and return an FLA_Error status.
// NOTE(review): "nl" presumably means no-inverse / lower-triangular, matching
// the inv/uplo parameters of FLA_Eig_gest_internal -- confirm.
//
// blk variants (presumably "blocked") additionally take a fla_eig_gest_t
// control structure pointer.
FLA_Error FLA_Eig_gest_nl_blk_var1( FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );
FLA_Error FLA_Eig_gest_nl_blk_var2( FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );
FLA_Error FLA_Eig_gest_nl_blk_var3( FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );
FLA_Error FLA_Eig_gest_nl_blk_var4( FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );
FLA_Error FLA_Eig_gest_nl_blk_var5( FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );

// unb variants (presumably "unblocked") take no control structure.
FLA_Error FLA_Eig_gest_nl_unb_var1( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_nl_unb_var2( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_nl_unb_var3( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_nl_unb_var4( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_nl_unb_var5( FLA_Obj A, FLA_Obj Y, FLA_Obj B );

// Eig_gest "nl" case, variant 1.
// The opt entry point takes FLA_Obj operands; the ops/opd/opc/opz entry points
// take raw buffers of float, double, scomplex and dcomplex respectively.
// NOTE(review): m_AB is presumably the common dimension of A and B, rs_*/cs_*
// the row/column strides of the matrix buffers, and inc_y the element stride
// of the work vector y -- inferred from names; confirm against the definition.
FLA_Error FLA_Eig_gest_nl_opt_var1( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_nl_ops_var1( int m_AB,
                                    float*    buff_A, int rs_A, int cs_A, 
                                    float*    buff_y, int inc_y,
                                    float*    buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_nl_opd_var1( int m_AB,
                                    double*   buff_A, int rs_A, int cs_A, 
                                    double*   buff_y, int inc_y,
                                    double*   buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_nl_opc_var1( int m_AB,
                                    scomplex* buff_A, int rs_A, int cs_A, 
                                    scomplex* buff_y, int inc_y,
                                    scomplex* buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_nl_opz_var1( int m_AB,
                                    dcomplex* buff_A, int rs_A, int cs_A, 
                                    dcomplex* buff_y, int inc_y,
                                    dcomplex* buff_B, int rs_B, int cs_B );

// Eig_gest "nl" case, variant 2.
// opt: FLA_Obj operands. ops/opd/opc/opz: raw float/double/scomplex/dcomplex
// buffers for A, y and B, with y described by a single increment inc_y.
// NOTE(review): dimension/stride meanings (m_AB, rs_*, cs_*, inc_y) are
// inferred from the parameter names -- confirm against the definition.
FLA_Error FLA_Eig_gest_nl_opt_var2( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_nl_ops_var2( int m_AB,
                                    float*    buff_A, int rs_A, int cs_A, 
                                    float*    buff_y, int inc_y,
                                    float*    buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_nl_opd_var2( int m_AB,
                                    double*   buff_A, int rs_A, int cs_A, 
                                    double*   buff_y, int inc_y,
                                    double*   buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_nl_opc_var2( int m_AB,
                                    scomplex* buff_A, int rs_A, int cs_A, 
                                    scomplex* buff_y, int inc_y,
                                    scomplex* buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_nl_opz_var2( int m_AB,
                                    dcomplex* buff_A, int rs_A, int cs_A, 
                                    dcomplex* buff_y, int inc_y,
                                    dcomplex* buff_B, int rs_B, int cs_B );

// Eig_gest "nl" case, variant 3.
// opt: FLA_Obj operands. ops/opd/opc/opz: raw float/double/scomplex/dcomplex
// buffers. Note that, like the other nl variants, the typed routines here take
// y as (buff_y, inc_y); only the il/iu var3 declarations in this header use a
// two-stride Y.
// NOTE(review): dimension/stride meanings (m_AB, rs_*, cs_*, inc_y) are
// inferred from the parameter names -- confirm against the definition.
FLA_Error FLA_Eig_gest_nl_opt_var3( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_nl_ops_var3( int m_AB,
                                    float*    buff_A, int rs_A, int cs_A, 
                                    float*    buff_y, int inc_y,
                                    float*    buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_nl_opd_var3( int m_AB,
                                    double*   buff_A, int rs_A, int cs_A, 
                                    double*   buff_y, int inc_y,
                                    double*   buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_nl_opc_var3( int m_AB,
                                    scomplex* buff_A, int rs_A, int cs_A, 
                                    scomplex* buff_y, int inc_y,
                                    scomplex* buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_nl_opz_var3( int m_AB,
                                    dcomplex* buff_A, int rs_A, int cs_A, 
                                    dcomplex* buff_y, int inc_y,
                                    dcomplex* buff_B, int rs_B, int cs_B );

// Eig_gest "nl" case, variant 4.
// opt: FLA_Obj operands. ops/opd/opc/opz: raw float/double/scomplex/dcomplex
// buffers for A, y and B, with y described by a single increment inc_y.
// NOTE(review): dimension/stride meanings (m_AB, rs_*, cs_*, inc_y) are
// inferred from the parameter names -- confirm against the definition.
FLA_Error FLA_Eig_gest_nl_opt_var4( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_nl_ops_var4( int m_AB,
                                    float*    buff_A, int rs_A, int cs_A, 
                                    float*    buff_y, int inc_y, 
                                    float*    buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_nl_opd_var4( int m_AB,
                                    double*   buff_A, int rs_A, int cs_A, 
                                    double*   buff_y, int inc_y, 
                                    double*   buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_nl_opc_var4( int m_AB,
                                    scomplex* buff_A, int rs_A, int cs_A, 
                                    scomplex* buff_y, int inc_y, 
                                    scomplex* buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_nl_opz_var4( int m_AB,
                                    dcomplex* buff_A, int rs_A, int cs_A, 
                                    dcomplex* buff_y, int inc_y, 
                                    dcomplex* buff_B, int rs_B, int cs_B );

// Eig_gest "nl" case, variant 5.
// opt: FLA_Obj operands. ops/opd/opc/opz: raw float/double/scomplex/dcomplex
// buffers for A, y and B, with y described by a single increment inc_y.
// NOTE(review): dimension/stride meanings (m_AB, rs_*, cs_*, inc_y) are
// inferred from the parameter names -- confirm against the definition.
FLA_Error FLA_Eig_gest_nl_opt_var5( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_nl_ops_var5( int m_AB,
                                    float*    buff_A, int rs_A, int cs_A, 
                                    float*    buff_y, int inc_y, 
                                    float*    buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_nl_opd_var5( int m_AB,
                                    double*   buff_A, int rs_A, int cs_A, 
                                    double*   buff_y, int inc_y, 
                                    double*   buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_nl_opc_var5( int m_AB,
                                    scomplex* buff_A, int rs_A, int cs_A, 
                                    scomplex* buff_y, int inc_y, 
                                    scomplex* buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_nl_opz_var5( int m_AB,
                                    dcomplex* buff_A, int rs_A, int cs_A, 
                                    dcomplex* buff_y, int inc_y, 
                                    dcomplex* buff_B, int rs_B, int cs_B );

// end FLA_Eig_gest_nl.h
// begin FLA_Eig_gest_nu.h


// skipped #include "FLAME.h" 

// Object-based algorithmic variants 1-5 of Eig_gest for the "nu" case.
// All take the FLA_Obj operands A, Y and B and return an FLA_Error status.
// NOTE(review): "nu" presumably means no-inverse / upper-triangular, matching
// the inv/uplo parameters of FLA_Eig_gest_internal -- confirm.
//
// blk variants (presumably "blocked") additionally take a fla_eig_gest_t
// control structure pointer.
FLA_Error FLA_Eig_gest_nu_blk_var1( FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );
FLA_Error FLA_Eig_gest_nu_blk_var2( FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );
FLA_Error FLA_Eig_gest_nu_blk_var3( FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );
FLA_Error FLA_Eig_gest_nu_blk_var4( FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );
FLA_Error FLA_Eig_gest_nu_blk_var5( FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );

// unb variants (presumably "unblocked") take no control structure.
FLA_Error FLA_Eig_gest_nu_unb_var1( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_nu_unb_var2( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_nu_unb_var3( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_nu_unb_var4( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_nu_unb_var5( FLA_Obj A, FLA_Obj Y, FLA_Obj B );

// Eig_gest "nu" case, variant 1.
// The opt entry point takes FLA_Obj operands; the ops/opd/opc/opz entry points
// take raw buffers of float, double, scomplex and dcomplex respectively.
// NOTE(review): m_AB is presumably the common dimension of A and B, rs_*/cs_*
// the row/column strides of the matrix buffers, and inc_y the element stride
// of the work vector y -- inferred from names; confirm against the definition.
FLA_Error FLA_Eig_gest_nu_opt_var1( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_nu_ops_var1( int m_AB,
                                    float*    buff_A, int rs_A, int cs_A, 
                                    float*    buff_y, int inc_y,
                                    float*    buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_nu_opd_var1( int m_AB,
                                    double*   buff_A, int rs_A, int cs_A, 
                                    double*   buff_y, int inc_y,
                                    double*   buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_nu_opc_var1( int m_AB,
                                    scomplex* buff_A, int rs_A, int cs_A, 
                                    scomplex* buff_y, int inc_y,
                                    scomplex* buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_nu_opz_var1( int m_AB,
                                    dcomplex* buff_A, int rs_A, int cs_A, 
                                    dcomplex* buff_y, int inc_y,
                                    dcomplex* buff_B, int rs_B, int cs_B );

// Eig_gest "nu" case, variant 2.
// opt: FLA_Obj operands. ops/opd/opc/opz: raw float/double/scomplex/dcomplex
// buffers for A, y and B, with y described by a single increment inc_y.
// NOTE(review): dimension/stride meanings (m_AB, rs_*, cs_*, inc_y) are
// inferred from the parameter names -- confirm against the definition.
FLA_Error FLA_Eig_gest_nu_opt_var2( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_nu_ops_var2( int m_AB,
                                    float*    buff_A, int rs_A, int cs_A, 
                                    float*    buff_y, int inc_y,
                                    float*    buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_nu_opd_var2( int m_AB,
                                    double*   buff_A, int rs_A, int cs_A, 
                                    double*   buff_y, int inc_y,
                                    double*   buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_nu_opc_var2( int m_AB,
                                    scomplex* buff_A, int rs_A, int cs_A, 
                                    scomplex* buff_y, int inc_y,
                                    scomplex* buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_nu_opz_var2( int m_AB,
                                    dcomplex* buff_A, int rs_A, int cs_A, 
                                    dcomplex* buff_y, int inc_y,
                                    dcomplex* buff_B, int rs_B, int cs_B );

// Eig_gest "nu" case, variant 3.
// opt: FLA_Obj operands. ops/opd/opc/opz: raw float/double/scomplex/dcomplex
// buffers. Note that, like the other nu variants, the typed routines here take
// y as (buff_y, inc_y); only the il/iu var3 declarations in this header use a
// two-stride Y.
// NOTE(review): dimension/stride meanings (m_AB, rs_*, cs_*, inc_y) are
// inferred from the parameter names -- confirm against the definition.
FLA_Error FLA_Eig_gest_nu_opt_var3( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_nu_ops_var3( int m_AB,
                                    float*    buff_A, int rs_A, int cs_A, 
                                    float*    buff_y, int inc_y, 
                                    float*    buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_nu_opd_var3( int m_AB,
                                    double*   buff_A, int rs_A, int cs_A, 
                                    double*   buff_y, int inc_y, 
                                    double*   buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_nu_opc_var3( int m_AB,
                                    scomplex* buff_A, int rs_A, int cs_A, 
                                    scomplex* buff_y, int inc_y, 
                                    scomplex* buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_nu_opz_var3( int m_AB,
                                    dcomplex* buff_A, int rs_A, int cs_A, 
                                    dcomplex* buff_y, int inc_y, 
                                    dcomplex* buff_B, int rs_B, int cs_B );

// Eig_gest "nu" case, variant 4.
// opt: FLA_Obj operands. ops/opd/opc/opz: raw float/double/scomplex/dcomplex
// buffers for A, y and B, with y described by a single increment inc_y.
// NOTE(review): dimension/stride meanings (m_AB, rs_*, cs_*, inc_y) are
// inferred from the parameter names -- confirm against the definition.
FLA_Error FLA_Eig_gest_nu_opt_var4( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_nu_ops_var4( int m_AB,
                                    float*    buff_A, int rs_A, int cs_A, 
                                    float*    buff_y, int inc_y, 
                                    float*    buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_nu_opd_var4( int m_AB,
                                    double*   buff_A, int rs_A, int cs_A, 
                                    double*   buff_y, int inc_y, 
                                    double*   buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_nu_opc_var4( int m_AB,
                                    scomplex* buff_A, int rs_A, int cs_A, 
                                    scomplex* buff_y, int inc_y, 
                                    scomplex* buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_nu_opz_var4( int m_AB,
                                    dcomplex* buff_A, int rs_A, int cs_A, 
                                    dcomplex* buff_y, int inc_y, 
                                    dcomplex* buff_B, int rs_B, int cs_B );

// Eig_gest "nu" case, variant 5.
// opt: FLA_Obj operands. ops/opd/opc/opz: raw float/double/scomplex/dcomplex
// buffers for A, y and B, with y described by a single increment inc_y.
// NOTE(review): dimension/stride meanings (m_AB, rs_*, cs_*, inc_y) are
// inferred from the parameter names -- confirm against the definition.
FLA_Error FLA_Eig_gest_nu_opt_var5( FLA_Obj A, FLA_Obj Y, FLA_Obj B );
FLA_Error FLA_Eig_gest_nu_ops_var5( int m_AB,
                                    float*    buff_A, int rs_A, int cs_A, 
                                    float*    buff_y, int inc_y, 
                                    float*    buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_nu_opd_var5( int m_AB,
                                    double*   buff_A, int rs_A, int cs_A, 
                                    double*   buff_y, int inc_y, 
                                    double*   buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_nu_opc_var5( int m_AB,
                                    scomplex* buff_A, int rs_A, int cs_A, 
                                    scomplex* buff_y, int inc_y, 
                                    scomplex* buff_B, int rs_B, int cs_B );
FLA_Error FLA_Eig_gest_nu_opz_var5( int m_AB,
                                    dcomplex* buff_A, int rs_A, int cs_A, 
                                    dcomplex* buff_y, int inc_y, 
                                    dcomplex* buff_B, int rs_B, int cs_B );

// end FLA_Eig_gest_nu.h

// Control-tree driven entry point for Eig_gest. The inv and uplo arguments
// select the case; A, Y and B are the FLA_Obj operands and cntl points to an
// fla_eig_gest_t control structure.
// NOTE(review): presumably dispatches to one of the il/iu/nl/nu front ends
// below based on inv/uplo -- confirm in the implementation.
FLA_Error FLA_Eig_gest_internal( FLA_Inv inv, FLA_Uplo uplo, FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );
// Case-specific front ends: same operands and control structure, with the
// inv/uplo case fixed by the two-letter suffix.
FLA_Error FLA_Eig_gest_il( FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );
FLA_Error FLA_Eig_gest_iu( FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );
FLA_Error FLA_Eig_gest_nl( FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );
FLA_Error FLA_Eig_gest_nu( FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl );

// end FLA_Eig_gest.h
// end FLA_lapack_var_prototypes.h

  // Include FLASH headers.
// begin FLASH.h


#ifndef FLASH_H
#define FLASH_H

// begin FLASH_macro_defs.h


// Include guard for the FLASH macro definitions.
#ifndef FLASH_MACRO_DEFS_H
#define FLASH_MACRO_DEFS_H

// Expands to the result of FLA_Obj_buffer_at_view( A ) cast to FLA_Obj*.
// NOTE(review): presumably used to reach the child FLA_Obj stored in an
// element of a hierarchical matrix -- confirm against callers.
#define FLASH_OBJ_PTR_AT( A )  ( (FLA_Obj *) FLA_Obj_buffer_at_view( A ) )

// Integer direction codes for flat <-> hierarchical transfers.
// NOTE(review): presumably the values expected by the `int direction`
// parameter of FLASH_Axpy_hierarchy / FLASH_Copy_hierarchy -- confirm.
#define FLA_FLAT_TO_HIER 4000
#define FLA_HIER_TO_FLAT 4001

#endif
// end FLASH_macro_defs.h

// begin FLASH_main_prototypes.h


// -----------------------------------------------------------------------------

// Check routine paired with FLASH_Obj_blocksizes: takes the same (H, b_m, b_n)
// parameter list but returns an FLA_Error status.
// NOTE(review): presumably validates the arguments only -- confirm.
FLA_Error    FLASH_Obj_blocksizes_check( FLA_Obj H, dim_t* b_m, dim_t* b_n );

// Check routines whose parameter lists match FLASH_Obj_create_helper and
// FLASH_Obj_create_hierarchy respectively.
// NOTE(review): presumably argument validators invoked by those routines when
// error checking is enabled -- confirm against the implementation.
FLA_Error    FLASH_Obj_create_helper_check( FLA_Bool without_buffer, FLA_Datatype datatype, dim_t m, dim_t n, dim_t depth, dim_t* b_m, dim_t* b_n, FLA_Obj* H );
FLA_Error    FLASH_Obj_create_hierarchy_check( FLA_Datatype datatype, dim_t m, dim_t n, dim_t depth, dim_t* elem_sizes_m, dim_t* elem_sizes_n, FLA_Obj flat_matrix, FLA_Obj* H, unsigned long id, dim_t depth_overall, dim_t* depth_sizes_m, dim_t* depth_sizes_n, dim_t* m_offsets, dim_t* n_offsets );

// Check routines for the "create conformal to" / "create copy of" family.
// Each has the same parameter list as the identically named routine without
// the _check suffix declared later in this header.
// NOTE(review): presumably argument validators -- confirm.
FLA_Error    FLASH_Obj_create_conf_to_check( FLA_Trans trans, FLA_Obj H_cur, FLA_Obj* H_new );

// Flat (F) <-> hierarchical (H) creation checks. The _ext forms take separate
// b_m / b_n blocksize arrays; the non-_ext forms take a single b_mn array.
FLA_Error    FLASH_Obj_create_hier_conf_to_flat_check( FLA_Trans trans, FLA_Obj F, dim_t depth, dim_t* b_mn, FLA_Obj* H );
FLA_Error    FLASH_Obj_create_hier_conf_to_flat_ext_check( FLA_Trans trans, FLA_Obj F, dim_t depth, dim_t* b_m, dim_t* b_n, FLA_Obj* H );
FLA_Error    FLASH_Obj_create_flat_conf_to_hier_check( FLA_Trans trans, FLA_Obj H, FLA_Obj* F );
FLA_Error    FLASH_Obj_create_hier_copy_of_flat_check( FLA_Obj F, dim_t depth, dim_t* b_mn, FLA_Obj* H );
FLA_Error    FLASH_Obj_create_hier_copy_of_flat_ext_check( FLA_Obj F, dim_t depth, dim_t* b_m, dim_t* b_n, FLA_Obj* H );
FLA_Error    FLASH_Obj_create_flat_copy_of_hier_check( FLA_Obj H, FLA_Obj* F );

// Check routines for the FLASH object release functions. Each takes a pointer
// to the hierarchical object H and returns an FLA_Error status.
// NOTE(review): presumably argument validators for the correspondingly named
// free routines -- confirm.
FLA_Error    FLASH_Obj_free_check( FLA_Obj* H );
FLA_Error    FLASH_Obj_free_without_buffer_check( FLA_Obj* H );
FLA_Error    FLASH_Obj_free_hierarchy_check( FLA_Obj* H );

// Check routines for attaching a buffer to a hierarchical object H: one form
// takes a raw buffer with rs/cs arguments, the other a flat object F.
// NOTE(review): rs/cs are presumably row/column strides of `buffer`, and these
// are presumably argument validators -- confirm.
FLA_Error    FLASH_Obj_attach_buffer_check( void *buffer, dim_t rs, dim_t cs, FLA_Obj* H );
FLA_Error    FLASH_Obj_attach_buffer_hierarchy_check( FLA_Obj F, FLA_Obj* H );

// -----------------------------------------------------------------------------

// Partition-creation routines. Each takes an object A, writes the resulting
// sub-objects through the FLA_Obj* output parameters (2x1: AT/AB, 1x2: AL/AR,
// 2x2: ATL/ATR/ABL/ABR), and takes a size (n_rows and/or n_cols) plus a side.
// NOTE(review): presumably `side` selects which sub-partition receives the
// given number of rows/columns -- confirm against the implementation.
FLA_Error FLASH_Part_create_2x1( FLA_Obj A,    FLA_Obj* AT,
                                               FLA_Obj* AB,
                                 dim_t n_rows, FLA_Side side );
FLA_Error FLASH_Part_create_1x2( FLA_Obj A,    FLA_Obj* AL, FLA_Obj* AR,
                                 dim_t n_cols, FLA_Side side );
FLA_Error FLASH_Part_create_2x2( FLA_Obj A,    FLA_Obj* ATL, FLA_Obj* ATR,
                                               FLA_Obj* ABL, FLA_Obj* ABR,
                                 dim_t n_rows, dim_t n_cols, FLA_Side side );

// Matching release routines; each takes pointers to the sub-objects of one
// partitioning shape (presumably those produced by the corresponding
// FLASH_Part_create_* call).
FLA_Error FLASH_Part_free_2x1( FLA_Obj* AT,
                               FLA_Obj* AB );
FLA_Error FLASH_Part_free_1x2( FLA_Obj* AL, FLA_Obj* AR );
FLA_Error FLASH_Part_free_2x2( FLA_Obj* ATL, FLA_Obj* ATR,
                               FLA_Obj* ABL, FLA_Obj* ABR );

// View-adjustment routines. Both take an attach_buffer flag, an (offm, offn)
// offset pair, an (m, n) size pair, a source object A and an output object
// pointer S.
// NOTE(review): presumably S receives a view of A restricted to the m-by-n
// region at offset (offm, offn), with the _hierarchy form being the recursive
// worker -- confirm against the implementation.
FLA_Error FLASH_Obj_adjust_views( FLA_Bool attach_buffer, dim_t offm, dim_t offn, dim_t m, dim_t n, FLA_Obj A, FLA_Obj* S );
FLA_Error FLASH_Obj_adjust_views_hierarchy( FLA_Bool attach_buffer, dim_t offm, dim_t offn, dim_t m, dim_t n, FLA_Obj A, FLA_Obj* S );

// Query routines on a hierarchical object H; each returns a dim_t.
// NOTE(review): "scalar" presumably means the quantity is measured in scalar
// elements of the underlying matrix rather than in blocks, and the _tl /
// base_ forms presumably refer to the top-left block and the base object
// respectively -- confirm against the implementation.
dim_t FLASH_Obj_scalar_length( FLA_Obj H );
dim_t FLASH_Obj_scalar_width( FLA_Obj H );
dim_t FLASH_Obj_scalar_min_dim( FLA_Obj H );
dim_t FLASH_Obj_scalar_max_dim( FLA_Obj H );
dim_t FLASH_Obj_scalar_vector_dim( FLA_Obj H );
dim_t FLASH_Obj_scalar_row_offset( FLA_Obj H );
dim_t FLASH_Obj_scalar_col_offset( FLA_Obj H );
dim_t FLASH_Obj_scalar_length_tl( FLA_Obj H );
dim_t FLASH_Obj_scalar_width_tl( FLA_Obj H );
dim_t FLASH_Obj_base_scalar_length( FLA_Obj H );
dim_t FLASH_Obj_base_scalar_width( FLA_Obj H );

// Display routines for a hierarchical object H.
// FLASH_Obj_show takes header and footer strings plus an element format
// string; FLASH_Obj_show_hierarchy takes an index i and the element format.
// NOTE(review): elem_format is presumably a printf-style format applied to
// each element, and i presumably a row index -- confirm. The string
// parameters are declared as non-const char*.
FLA_Error FLASH_Obj_show( char* header, FLA_Obj H, char* elem_format, char* footer );
FLA_Error FLASH_Obj_show_hierarchy( FLA_Obj H, dim_t i, char* elem_format );

// -----------------------------------------------------------------------------

// Axpy-style transfers, scaled by alpha, between a hierarchical object H and
// either a raw buffer (m, n, buffer, rs, cs) or a flat object F. The (i, j)
// pair accompanies H in every signature.
// NOTE(review): (i, j) is presumably the offset within H at which the
// transfer starts, and rs/cs the buffer's row/column strides -- confirm.
FLA_Error    FLASH_Axpy_buffer_to_hier( FLA_Obj alpha, dim_t m, dim_t n, void* buffer, dim_t rs, dim_t cs, dim_t i, dim_t j, FLA_Obj H );
FLA_Error    FLASH_Axpy_hier_to_buffer( FLA_Obj alpha, dim_t i, dim_t j, FLA_Obj H, dim_t m, dim_t n, void* buffer, dim_t rs, dim_t cs );
FLA_Error    FLASH_Axpy_flat_to_hier( FLA_Obj alpha, FLA_Obj F, dim_t i, dim_t j, FLA_Obj H );
FLA_Error    FLASH_Axpy_hier_to_flat( FLA_Obj alpha, dim_t i, dim_t j, FLA_Obj H, FLA_Obj F );

// Direction-parameterised form operating on a flat object F and a pointer to
// a hierarchical object H.
// NOTE(review): `direction` is presumably FLA_FLAT_TO_HIER or
// FLA_HIER_TO_FLAT -- confirm.
FLA_Error    FLASH_Axpy_hierarchy( int direction, FLA_Obj alpha, FLA_Obj F, FLA_Obj* H );

// -----------------------------------------------------------------------------

// Copy transfers between a hierarchical object H and either a raw buffer
// (m, n, buffer, rs, cs) or a flat object F. The signatures mirror the
// FLASH_Axpy_* family without the alpha argument.
// NOTE(review): (i, j) is presumably the offset within H at which the
// transfer starts, and rs/cs the buffer's row/column strides -- confirm.
FLA_Error    FLASH_Copy_buffer_to_hier( dim_t m, dim_t n, void* buffer, dim_t rs, dim_t cs, dim_t i, dim_t j, FLA_Obj H );
FLA_Error    FLASH_Copy_hier_to_buffer( dim_t i, dim_t j, FLA_Obj H, dim_t m, dim_t n, void* buffer, dim_t rs, dim_t cs );
FLA_Error    FLASH_Copy_flat_to_hier( FLA_Obj F, dim_t i, dim_t j, FLA_Obj H );
FLA_Error    FLASH_Copy_hier_to_flat( dim_t i, dim_t j, FLA_Obj H, FLA_Obj F );

// Direction-parameterised form operating on a flat object F and a pointer to
// a hierarchical object H.
// NOTE(review): `direction` is presumably FLA_FLAT_TO_HIER or
// FLA_HIER_TO_FLAT -- confirm.
FLA_Error    FLASH_Copy_hierarchy( int direction, FLA_Obj F, FLA_Obj* H );

// -----------------------------------------------------------------------------

// Property queries on an object H: its datatype, its depth, and its
// blocksizes. FLASH_Obj_blocksizes() writes through the caller-supplied b_m
// and b_n pointers and also returns a dim_t.
// NOTE(review): the required capacity of b_m/b_n and the meaning of the
// returned value are not visible here -- confirm against the definition.
FLA_Datatype FLASH_Obj_datatype( FLA_Obj H );
dim_t        FLASH_Obj_depth( FLA_Obj H );
dim_t        FLASH_Obj_blocksizes( FLA_Obj H, dim_t* b_m, dim_t* b_n );

// Object creation entry points. All take a datatype, an m x n size, a depth
// and blocksize array(s), and deliver the new object through H. The plain
// variants take a single b_mn array; the "_ext" variants take separate b_m
// and b_n arrays. The "_without_buffer" variants presumably skip allocation
// of the data buffer -- confirm against the implementations.
FLA_Error    FLASH_Obj_create( FLA_Datatype datatype, dim_t m, dim_t n, dim_t depth, dim_t* b_mn, FLA_Obj* H );
FLA_Error    FLASH_Obj_create_ext( FLA_Datatype datatype, dim_t m, dim_t n, dim_t depth, dim_t* b_m, dim_t* b_n, FLA_Obj* H );
FLA_Error    FLASH_Obj_create_without_buffer( FLA_Datatype datatype, dim_t m, dim_t n, dim_t depth, dim_t* b_mn, FLA_Obj* H );
FLA_Error    FLASH_Obj_create_without_buffer_ext( FLA_Datatype datatype, dim_t m, dim_t n, dim_t depth, dim_t* b_m, dim_t* b_n, FLA_Obj* H );

// Lower-level creation routines. The helper adds a without_buffer flag to
// the "_ext" parameter list; the hierarchy routine additionally takes a flat
// matrix object, an id, an overall depth, and per-depth size and offset
// arrays.
// NOTE(review): presumably these are internal to the creation routines above
// rather than intended for direct use -- confirm.
FLA_Error    FLASH_Obj_create_helper( FLA_Bool without_buffer, FLA_Datatype datatype, dim_t m, dim_t n, dim_t depth, dim_t* b_m, dim_t* b_n, FLA_Obj* H );
FLA_Error    FLASH_Obj_create_hierarchy( FLA_Datatype datatype, dim_t m, dim_t n, dim_t depth, dim_t* elem_sizes_m, dim_t* elem_sizes_n, FLA_Obj flat_matrix, FLA_Obj* H, unsigned long id, dim_t depth_overall, dim_t* depth_sizes_m, dim_t* depth_sizes_n, dim_t* m_offsets, dim_t* n_offsets );

// Creation routines that derive the new object from an existing one.
// "conf_to" variants take a trans argument and a source object; "copy_of"
// variants take a source object (FLASH_Obj_create_copy_of also takes trans).
// The hier-from-flat variants additionally take a depth and blocksize
// array(s), with "_ext" again meaning separate b_m / b_n arrays.
// NOTE(review): by their names, "conf_to" creates an object conformal to the
// source and "copy_of" also copies its contents -- confirm against the
// implementations.
FLA_Error    FLASH_Obj_create_conf_to( FLA_Trans trans, FLA_Obj H_cur, FLA_Obj* H_new );
FLA_Error    FLASH_Obj_create_hier_conf_to_flat( FLA_Trans trans, FLA_Obj F, dim_t depth, dim_t* b_mn, FLA_Obj* H );
FLA_Error    FLASH_Obj_create_hier_conf_to_flat_ext( FLA_Trans trans, FLA_Obj F, dim_t depth, dim_t* b_m, dim_t* b_n, FLA_Obj* H );
FLA_Error    FLASH_Obj_create_flat_conf_to_hier( FLA_Trans trans, FLA_Obj H, FLA_Obj* F );
FLA_Error    FLASH_Obj_create_copy_of( FLA_Trans trans, FLA_Obj H_cur, FLA_Obj* H_new );
FLA_Error    FLASH_Obj_create_hier_copy_of_flat( FLA_Obj F, dim_t depth, dim_t* b_mn, FLA_Obj* H );
FLA_Error    FLASH_Obj_create_hier_copy_of_flat_ext( FLA_Obj F, dim_t depth, dim_t* b_m, dim_t* b_n, FLA_Obj* H );
FLA_Error    FLASH_Obj_create_flat_copy_of_hier( FLA_Obj H, FLA_Obj* F );

// Release routines; each takes the object by pointer and returns nothing.
// NOTE(review): which of these pairs with which FLASH_Obj_create* variant
// (in particular the "_without_buffer" ones) is not visible here -- confirm
// against the implementations.
void         FLASH_Obj_free( FLA_Obj* H );
void         FLASH_Obj_free_hierarchy( FLA_Obj* H );
void         FLASH_Obj_free_without_buffer( FLA_Obj* H );

// Buffer attachment: the first form takes a raw buffer with rs/cs values
// (presumably row and column strides -- confirm), the second takes an
// object F. Both receive H by pointer.
FLA_Error    FLASH_Obj_attach_buffer( void* buffer, dim_t rs, dim_t cs, FLA_Obj* H );
FLA_Error    FLASH_Obj_attach_buffer_hierarchy( FLA_Obj F, FLA_Obj* H );

// Conversions between an object H and an object F, both passed by value.
// NOTE(review): by the names, flatten moves H into F and hierarchify moves
// F into H -- confirm the direction against the implementations.
FLA_Error    FLASH_Obj_flatten( FLA_Obj H, FLA_Obj F );
FLA_Error    FLASH_Obj_hierarchify( FLA_Obj F, FLA_Obj H );

// Returns a raw pointer obtained from H.
// NOTE(review): by its name this is the underlying data buffer; ownership
// rules are not visible here -- confirm against the implementation.
void*        FLASH_Obj_extract_buffer( FLA_Obj H );

// FLASH_Obj_show() is already declared earlier in this header, next to
// FLASH_Obj_show_hierarchy(), so it is not declared a second time here.

// Structure-printing routines; the helper additionally takes an indent value.
void         FLASH_print_struct( FLA_Obj H );
void         FLASH_print_struct_helper( FLA_Obj H, int indent );
// end FLASH_main_prototypes.h
// begin FLASH_util_prototypes.h


// --- FLASH utility routine prototypes ----------------------------------------

// Takes two objects and returns a double (by its name, the largest
// element-wise difference between A and B -- confirm).
double    FLASH_Max_elemwise_diff( FLA_Obj A, FLA_Obj B );

// Fill routines operating on H; the "spd" variant also takes a uplo value.
FLA_Error FLASH_Random_matrix( FLA_Obj H );
FLA_Error FLASH_Random_spd_matrix( FLA_Uplo uplo, FLA_Obj H );

// FLASH_Norm1 receives the result operand as an FLA_Obj (norm) rather than
// returning it; FLASH_Obj_shift_diagonal takes a conj flag and a sigma
// operand.
FLA_Error FLASH_Norm1( FLA_Obj H, FLA_Obj norm );
FLA_Error FLASH_Obj_shift_diagonal( FLA_Conj conj, FLA_Obj sigma, FLA_Obj H );

// Takes an alpha operand and an object H (presumably sets every element of
// H to alpha -- confirm).
FLA_Error FLASH_Set( FLA_Obj alpha, FLA_Obj H );

// Takes an object A and delivers a new object through U.
FLA_Error FLASH_Obj_create_diag_panel( FLA_Obj A, FLA_Obj* U );

// NOTE(review): how a zero diagonal entry is reported through the FLA_Error
// return value is not visible here -- confirm against the definition.
FLA_Error FLASH_LU_find_zero_on_diagonal( FLA_Obj A );

// Structure-imposing routines on A, parameterized by uplo (and diag for the
// triangular case).
FLA_Error FLASH_Triangularize( FLA_Uplo uplo, FLA_Diag diag, FLA_Obj A );
FLA_Error FLASH_Hermitianize( FLA_Uplo uplo, FLA_Obj A );

// --- FLASH utility check routine prototypes ----------------------------------

// Check routine with the same parameter list as
// FLASH_LU_find_zero_on_diagonal() (presumably its argument validator --
// confirm).
FLA_Error FLASH_LU_find_zero_on_diagonal_check( FLA_Obj A );

// end FLASH_util_prototypes.h
// begin FLASH_blas1_prototypes.h


// --- top-level front-end prototypes ------------------------------------------

// Level-1 front ends. Scalars (alpha) are passed as FLA_Obj operands; the
// "t" variants add a trans argument and FLASH_Scalr adds a uplo argument.
FLA_Error FLASH_Axpy( FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLASH_Axpyt( FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLASH_Copy( FLA_Obj A, FLA_Obj B );
FLA_Error FLASH_Copyt( FLA_Trans trans, FLA_Obj A, FLA_Obj B );
FLA_Error FLASH_Scal( FLA_Obj alpha, FLA_Obj A );
FLA_Error FLASH_Scalr( FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A );

// end FLASH_blas1_prototypes.h
// begin FLASH_blas2_prototypes.h


// --- top-level front-end prototypes ------------------------------------------

// Level-2 front ends: a matrix operand A with vector operands (x, and y for
// Gemv). Scalars alpha/beta are passed as FLA_Obj operands.
FLA_Error FLASH_Gemv( FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y );
FLA_Error FLASH_Trsv( FLA_Uplo uplo, FLA_Trans transa, FLA_Diag diag, FLA_Obj A, FLA_Obj x );

// end FLASH_blas2_prototypes.h
// begin FLASH_blas3_prototypes.h


// --- top-level front-end prototypes ------------------------------------------

// Level-3 front ends. Option arguments (side, uplo, trans, diag) come first,
// followed by the FLA_Obj operands; scalars alpha/beta are passed as FLA_Obj
// operands as well.
FLA_Error FLASH_Gemm( FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLASH_Hemm( FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLASH_Herk( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLASH_Her2k( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLASH_Symm( FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLASH_Syrk( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C );
FLA_Error FLASH_Syr2k( FLA_Uplo uplo, FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C );
FLA_Error FLASH_Trmm( FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );
FLA_Error FLASH_Trsm( FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj B );

// end FLASH_blas3_prototypes.h
// begin FLASH_lapack_prototypes.h


// --- top-level wrapper prototypes --------------------------------------------

// Implemented:
// LAPACK-level wrappers. Every routine returns FLA_Error and takes its
// matrix operands as FLA_Obj values; the "_solve" variants add right-hand
// side B and solution X operands to the corresponding factorization's
// argument list.
FLA_Error FLASH_Chol( FLA_Uplo uplo, FLA_Obj A );
FLA_Error FLASH_Chol_solve( FLA_Uplo uplo, FLA_Obj A, FLA_Obj B, FLA_Obj X );
FLA_Error FLASH_LU_nopiv( FLA_Obj A );
FLA_Error FLASH_LU_nopiv_solve( FLA_Obj A, FLA_Obj B, FLA_Obj X );
FLA_Error FLASH_LU_piv( FLA_Obj A, FLA_Obj p );
FLA_Error FLASH_LU_piv_solve( FLA_Obj A, FLA_Obj p, FLA_Obj B, FLA_Obj X );
FLA_Error FLASH_LU_incpiv( FLA_Obj A, FLA_Obj p, FLA_Obj L );
FLA_Error FLASH_FS_incpiv( FLA_Obj A, FLA_Obj p, FLA_Obj L, FLA_Obj b );
FLA_Error FLASH_Trinv( FLA_Uplo uplo, FLA_Diag diag, FLA_Obj A );
FLA_Error FLASH_Ttmm( FLA_Uplo uplo, FLA_Obj A );
FLA_Error FLASH_SPDinv( FLA_Uplo uplo, FLA_Obj A );
FLA_Error FLASH_Sylv( FLA_Trans transa, FLA_Trans transb, FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale );
FLA_Error FLASH_Apply_Q_UT( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B );
FLA_Error FLASH_Apply_Q2_UT( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C, FLA_Obj E );
FLA_Error FLASH_QR2_UT( FLA_Obj B, FLA_Obj D, FLA_Obj T );
FLA_Error FLASH_QR_UT( FLA_Obj A, FLA_Obj TW );
FLA_Error FLASH_QR_UT_solve( FLA_Obj A, FLA_Obj T, FLA_Obj B, FLA_Obj X );
FLA_Error FLASH_QR_UT_inc( FLA_Obj A, FLA_Obj TW );
FLA_Error FLASH_QR_UT_inc_solve( FLA_Obj A, FLA_Obj TW, FLA_Obj B, FLA_Obj X );
FLA_Error FLASH_Apply_Q_UT_inc( FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj A, FLA_Obj TW, FLA_Obj W1, FLA_Obj B );
FLA_Error FLASH_Apply_pivots( FLA_Side side, FLA_Trans trans, FLA_Obj p, FLA_Obj A );
FLA_Error FLASH_Eig_gest( FLA_Inv inv, FLA_Uplo uplo, FLA_Obj A, FLA_Obj B );

// Not yet implemented:
// Declared for completeness only; per the note above, no definition is
// expected to back these prototypes yet.
// NOTE(review): FLASH_LQ_UT_inv has the same (A, TW) parameter list as
// FLASH_QR_UT_inc declared above -- confirm whether the "_inv" suffix is the
// intended spelling.
FLA_Error FLASH_LQ_UT_inv( FLA_Obj A, FLA_Obj TW );
FLA_Error FLASH_LQ2_UT( FLA_Obj B, FLA_Obj C, FLA_Obj T );
// end FLASH_lapack_prototypes.h

#endif
// end FLASH.h

  // Include SuperMatrix headers.
// begin FLASH_Queue.h


#ifndef FLASH_QUEUE_H
#define FLASH_QUEUE_H


// begin FLASH_Queue_main_prototypes.h


#ifndef FLASH_QUEUE_MAIN_PROTOTYPES_H
#define FLASH_QUEUE_MAIN_PROTOTYPES_H


// Queue control entry points. These are declared unconditionally, i.e.
// outside the FLA_ENABLE_SUPERMATRIX guard that follows.
// NOTE(review): begin/end presumably bracket a region and stack_depth
// reports the nesting level -- confirm against the implementation.
void           FLASH_Queue_begin( void );
void           FLASH_Queue_end( void );
unsigned int   FLASH_Queue_stack_depth( void );

// Enable/disable switch and its getter.
FLA_Error      FLASH_Queue_enable( void );
FLA_Error      FLASH_Queue_disable( void );
FLA_Bool       FLASH_Queue_get_enabled( void );

// Thread-count setter and getter.
void           FLASH_Queue_set_num_threads( unsigned int n_threads );
unsigned int   FLASH_Queue_get_num_threads( void );


#ifdef FLA_ENABLE_SUPERMATRIX


// The declarations below are only visible when FLA_ENABLE_SUPERMATRIX is
// defined.

// Queue set-up and tear-down.
void           FLASH_Queue_init( void );
void           FLASH_Queue_finalize( void );

// Returns an unsigned task count.
unsigned int   FLASH_Queue_get_num_tasks( void );

// Setter/getter pairs for queue options: verbose output, sorting, caching,
// work stealing and data affinity. The two time getters return doubles
// (units are not stated in this header).
void           FLASH_Queue_set_verbose_output( FLASH_Verbose verbose );
FLASH_Verbose  FLASH_Queue_get_verbose_output( void );
void           FLASH_Queue_set_sorting( FLA_Bool sorting );
FLA_Bool       FLASH_Queue_get_sorting( void );
void           FLASH_Queue_set_caching( FLA_Bool caching );
FLA_Bool       FLASH_Queue_get_caching( void );
void           FLASH_Queue_set_work_stealing( FLA_Bool work_stealing );
FLA_Bool       FLASH_Queue_get_work_stealing( void );
void           FLASH_Queue_set_data_affinity( FLASH_Data_aff data_affinity );
FLASH_Data_aff FLASH_Queue_get_data_affinity( void );
double         FLASH_Queue_get_total_time( void );
double         FLASH_Queue_get_parallel_time( void );

// Takes no arguments and returns nothing (by its name, runs the queued
// tasks -- confirm against the implementation).
void           FLASH_Queue_exec( void );


// --- helper functions -------------------------------------------------------

// Setter/getter pairs for timing, block size, cache size, cache line size
// and the cores-per-cache / cores-per-queue settings.
// NOTE(review): units of the size values are not stated here -- confirm.
void           FLASH_Queue_set_parallel_time( double dtime );
void           FLASH_Queue_set_block_size( dim_t size );
dim_t          FLASH_Queue_get_block_size( void );
void           FLASH_Queue_set_cache_size( dim_t size );
dim_t          FLASH_Queue_get_cache_size( void );
void           FLASH_Queue_set_cache_line_size( dim_t size );
dim_t          FLASH_Queue_get_cache_line_size( void );
void           FLASH_Queue_set_cores_per_cache( int cores );
int            FLASH_Queue_get_cores_per_cache( void );
void           FLASH_Queue_set_cores_per_queue( int cores );
int            FLASH_Queue_get_cores_per_queue( void );
void           FLASH_Queue_reset( void );
// Accessors returning FLASH_Task pointers for the head and tail tasks.
FLASH_Task*    FLASH_Queue_get_head_task( void );
FLASH_Task*    FLASH_Queue_get_tail_task( void );
// Variadic push. The fixed arguments are the task function and control
// pointers (both as void*), a name string, the enabled_gpu / enabled_hip
// flags, and four counts: n_int_args, n_fla_args, n_input_args and
// n_output_args. The variadic tail presumably carries that many arguments of
// each kind, in that order, as the ENQUEUE_FLASH_* macros supply them.
void           FLASH_Queue_push( void *func, void *cntl, char *name,
                                 FLA_Bool enabled_gpu, FLA_Bool enabled_hip,
                                 int n_int_args, int n_fla_args,
                                 int n_input_args, int n_output_args, ... );
void           FLASH_Queue_push_input( FLA_Obj obj, FLASH_Task* t );
void           FLASH_Queue_push_output( FLA_Obj obj, FLASH_Task* t );
// Same fixed parameter list as FLASH_Queue_push(), without the variadic
// tail; returns a FLASH_Task pointer, with FLASH_Task_free() as the
// matching release declaration.
FLASH_Task*    FLASH_Task_alloc( void *func, void *cntl, char *name,
                                 FLA_Bool enabled_gpu, FLA_Bool enabled_hip,
                                 int n_int_args, int n_fla_args,
                                 int n_input_args, int n_output_args );
void           FLASH_Task_free( FLASH_Task *t );
void           FLASH_Queue_exec_task( FLASH_Task *t );
void           FLASH_Queue_verbose_output( void );

// Scheduling hooks. Each takes an opaque "void *arg" whose concrete type is
// not visible in this header; queue and cache are passed as plain ints.
// The dequeue and work-stealing routines return a FLASH_Task pointer.
// NOTE(review): whether a NULL return means "no task available" is not
// visible here -- confirm against the implementation.
void           FLASH_Queue_init_tasks( void *arg );
void           FLASH_Queue_wait_enqueue( FLASH_Task *t, void *arg );
FLASH_Task*    FLASH_Queue_wait_dequeue( int queue, int cache, void *arg );
FLASH_Task*    FLASH_Queue_wait_dequeue_block( int queue, int cache, void *arg );
void           FLASH_Queue_update_cache( FLASH_Task *t, void *arg );
void           FLASH_Queue_update_cache_block( FLA_Obj obj, int cache, FLA_Bool output, void *arg );
void           FLASH_Queue_prefetch( int cache, void *arg );
void           FLASH_Queue_prefetch_block( FLA_Obj obj );
FLASH_Task*    FLASH_Queue_work_stealing( int queue, void *arg );
#ifdef FLA_ENABLE_GPU
// GPU hooks, declared only when FLA_ENABLE_GPU is defined. Routines are
// keyed either by a thread index (int thread) or by a task pointer, and all
// take the same opaque "void *arg" as the scheduling hooks above.
void           FLASH_Queue_create_gpu( int thread, void *arg );
void           FLASH_Queue_destroy_gpu( int thread, void *arg );
FLA_Bool       FLASH_Queue_exec_gpu( FLASH_Task *t, void *arg );
FLA_Bool       FLASH_Queue_check_gpu( FLASH_Task *t, void *arg );
FLA_Bool       FLASH_Queue_check_block_gpu( FLA_Obj obj, int thread, void *arg );
void           FLASH_Queue_update_gpu( FLASH_Task *t, void **input_arg, void **output_arg, void *arg );
void           FLASH_Queue_update_block_gpu( FLA_Obj obj, void **buffer_gpu, int thread, void *arg );
void           FLASH_Queue_mark_gpu( FLASH_Task *t, void *arg );
void           FLASH_Queue_invalidate_block_gpu( FLA_Obj obj, int thread, void *arg );
void           FLASH_Queue_flush_block_gpu( FLA_Obj obj, int thread, void *arg );
void           FLASH_Queue_flush_gpu( int thread, void *arg );
#endif
#ifdef FLA_ENABLE_HIP
// HIP hooks, declared only when FLA_ENABLE_HIP is defined. The set mirrors
// the "_gpu" hooks one-for-one with identical parameter lists (including the
// buffer_gpu parameter name in FLASH_Queue_update_block_hip).
void           FLASH_Queue_create_hip( int thread, void *arg );
void           FLASH_Queue_destroy_hip( int thread, void *arg );
FLA_Bool       FLASH_Queue_exec_hip( FLASH_Task *t, void *arg );
FLA_Bool       FLASH_Queue_check_hip( FLASH_Task *t, void *arg );
FLA_Bool       FLASH_Queue_check_block_hip( FLA_Obj obj, int thread, void *arg );
void           FLASH_Queue_update_hip( FLASH_Task *t, void **input_arg, void **output_arg, void *arg );
void           FLASH_Queue_update_block_hip( FLA_Obj obj, void **buffer_gpu, int thread, void *arg );
void           FLASH_Queue_mark_hip( FLASH_Task *t, void *arg );
void           FLASH_Queue_invalidate_block_hip( FLA_Obj obj, int thread, void *arg );
void           FLASH_Queue_flush_block_hip( FLA_Obj obj, int thread, void *arg );
void           FLASH_Queue_flush_hip( int thread, void *arg );
#endif
// Parallel execution entry points. FLASH_Queue_exec_parallel_function has
// the "void* f( void* )" shape (presumably so it can serve as a thread start
// routine -- confirm against the implementation).
void           FLASH_Queue_exec_parallel( void *arg );
void*          FLASH_Queue_exec_parallel_function( void *arg );
// Task bookkeeping; the two update routines return a FLASH_Task pointer.
FLASH_Task*    FLASH_Task_update_dependencies( FLASH_Task *t, void *arg );
FLASH_Task*    FLASH_Task_update_binding( FLASH_Task *t, FLASH_Task *r, void *arg );
void           FLASH_Task_free_parallel( FLASH_Task *t, void *arg );

// Same "void *arg" parameter as FLASH_Queue_exec_parallel().
void           FLASH_Queue_exec_simulation( void *arg );


#endif // FLA_ENABLE_SUPERMATRIX


#endif // FLASH_QUEUE_MAIN_PROTOTYPES_H
// end FLASH_Queue_main_prototypes.h
// begin FLASH_Queue_macro_defs.h


#ifndef FLASH_QUEUE_MACRO_DEFS_H
#define FLASH_QUEUE_MACRO_DEFS_H


#ifdef FLA_ENABLE_SUPERMATRIX


// Expands to the id member reached through the base pointer of object A.
// The argument is parenthesized, and the expansion is an lvalue.
#define FLASH_OBJ_PTR_ID( A )  ( A ).base->id

// FLASH_Verbose
// Integer codes for the verbose-output setting (the type used by
// FLASH_Queue_set_verbose_output / FLASH_Queue_get_verbose_output).
#define FLASH_QUEUE_VERBOSE_NONE                     0
#define FLASH_QUEUE_VERBOSE_READABLE                 1
#define FLASH_QUEUE_VERBOSE_GRAPHVIZ                 2

// FLASH_Data_aff
// Integer codes for the data-affinity setting (the type used by
// FLASH_Queue_set_data_affinity / FLASH_Queue_get_data_affinity).
#define FLASH_QUEUE_AFFINITY_NONE                    0
#define FLASH_QUEUE_AFFINITY_2D_BLOCK_CYCLIC         1
#define FLASH_QUEUE_AFFINITY_1D_ROW_BLOCK_CYCLIC     2
#define FLASH_QUEUE_AFFINITY_1D_COLUMN_BLOCK_CYCLIC  3
#define FLASH_QUEUE_AFFINITY_ROUND_ROBIN             4



// LAPACK-level

// Each ENQUEUE_FLASH_* macro expands to one FLASH_Queue_push() call. After
// the task function, the control pointer and the five-character task label
// come the enabled_gpu and enabled_hip flags, then four counts (n_int_args,
// n_fla_args, n_input_args, n_output_args) describing the variadic
// arguments that follow, in that order.
// The cntl parameter is parenthesized before the (void *) cast so that a
// compound expression (e.g. "base + 1" or "c ? a : b") is cast as a whole.

// LU with pivoting, macro variant: A and p are both outputs.
#define ENQUEUE_FLASH_LU_piv_macro( A, p, cntl ) \
        FLASH_Queue_push( (void *) FLA_LU_piv_macro_task, \
                          (void *) ( cntl ), \
                          "LU   ", \
                          FALSE, \
                          FALSE, \
                          0, 0, 0, 2, \
                          A, p )

// Pivot application, macro variant: p is an input, A is the output.
#define ENQUEUE_FLASH_Apply_pivots_macro( side, trans, p, A, cntl ) \
        FLASH_Queue_push( (void *) FLA_Apply_pivots_macro_task, \
                          (void *) ( cntl ), \
                          "Pivot", \
                          FALSE, \
                          FALSE, \
                          2, 0, 1, 1, \
                          side, trans, \
                          p, A )

// LU with pivoting: p travels as an FLA_Obj argument, A is the output.
#define ENQUEUE_FLASH_LU_piv( A, p, cntl ) \
        FLASH_Queue_push( (void *) FLA_LU_piv_task, \
                          (void *) ( cntl ), \
                          "LU   ", \
                          FALSE, \
                          TRUE, \
                          0, 1, 0, 1, \
                          p, A )

// As ENQUEUE_FLASH_LU_piv, with U as a second output.
#define ENQUEUE_FLASH_LU_piv_copy( A, p, U, cntl ) \
        FLASH_Queue_push( (void *) FLA_LU_piv_copy_task, \
                          (void *) ( cntl ), \
                          "LU   ", \
                          FALSE, \
                          TRUE, \
                          0, 1, 0, 2, \
                          p, A, U )

// Trsm with pivots: p is an FLA_Obj argument, A the input, C the output.
#define ENQUEUE_FLASH_Trsm_piv( A, C, p, cntl ) \
        FLASH_Queue_push( (void *) FLA_Trsm_piv_task, \
                          (void *) ( cntl ), \
                          "Trsm ", \
                          FALSE, \
                          TRUE, \
                          0, 1, 1, 1, \
                          p, A, C )

// SA_LU: nb_alg is the integer argument; p and L are FLA_Obj arguments;
// D and U are outputs.
#define ENQUEUE_FLASH_SA_LU( U, D, p, L, nb_alg, cntl ) \
        FLASH_Queue_push( (void *) FLA_SA_LU_task, \
                          (void *) ( cntl ), \
                          "SA_LU", \
                          FALSE, \
                          FALSE, \
                          1, 2, 0, 2, \
                          nb_alg, \
                          p, L, D, U )

// SA_FS: nb_alg is the integer argument; L and p are FLA_Obj arguments;
// D is the input; E and C are outputs.
#define ENQUEUE_FLASH_SA_FS( L, D, p, C, E, nb_alg, cntl ) \
        FLASH_Queue_push( (void *) FLA_SA_FS_task, \
                          (void *) ( cntl ), \
                          "SA_FS", \
                          FALSE, \
                          TRUE, \
                          1, 2, 1, 2, \
                          nb_alg, \
                          L, p, D, E, C )

// LU without pivoting: A is the single output.
#define ENQUEUE_FLASH_LU_nopiv( A, cntl ) \
        FLASH_Queue_push( (void *) FLA_LU_nopiv_task, \
                          (void *) ( cntl ), \
                          "LU   ", \
                          FALSE, \
                          FALSE, \
                          0, 0, 0, 1, \
                          A )

// Each macro below expands to one FLASH_Queue_push() call: task function,
// control pointer, task label, enabled_gpu and enabled_hip flags, then the
// four counts (n_int_args, n_fla_args, n_input_args, n_output_args)
// followed by the arguments they describe. cntl is parenthesized before the
// (void *) cast so that a compound expression is cast as a whole.

// Trinv: uplo and diag are integer arguments, A is the output.
#define ENQUEUE_FLASH_Trinv( uplo, diag, A, cntl ) \
        FLASH_Queue_push( (void *) FLA_Trinv_task, \
                          (void *) ( cntl ), \
                          "Trinv", \
                          FALSE, \
                          TRUE, \
                          2, 0, 0, 1, \
                          uplo, diag, \
                          A )

// Ttmm: uplo is the integer argument, A is the output.
#define ENQUEUE_FLASH_Ttmm( uplo, A, cntl ) \
        FLASH_Queue_push( (void *) FLA_Ttmm_task, \
                          (void *) ( cntl ), \
                          "Ttmm ", \
                          FALSE, \
                          FALSE, \
                          1, 0, 0, 1, \
                          uplo, \
                          A )

// Chol: uplo is the integer argument, A is the output.
#define ENQUEUE_FLASH_Chol( uplo, A, cntl ) \
        FLASH_Queue_push( (void *) FLA_Chol_task, \
                          (void *) ( cntl ), \
                          "Chol ", \
                          FALSE, \
                          TRUE, \
                          1, 0, 0, 1, \
                          uplo, \
                          A )

// Sylv: transA/transB are integer arguments; isgn and scale are FLA_Obj
// arguments; A and B are inputs; C is the output.
#define ENQUEUE_FLASH_Sylv( transA, transB, isgn, A, B, C, scale, cntl ) \
        FLASH_Queue_push( (void *) FLA_Sylv_task, \
                          (void *) ( cntl ), \
                          "Sylv ", \
                          FALSE, \
                          FALSE, \
                          2, 2, 2, 1, \
                          transA, transB, \
                          isgn, scale, \
                          A, B, C )

// Lyap: trans is the integer argument; isgn and scale are FLA_Obj
// arguments; A is the input; C is the output.
#define ENQUEUE_FLASH_Lyap( trans, isgn, A, C, scale, cntl ) \
        FLASH_Queue_push( (void *) FLA_Lyap_task, \
                          (void *) ( cntl ), \
                          "Lyap ", \
                          FALSE, \
                          FALSE, \
                          1, 2, 1, 1, \
                          trans, \
                          isgn, scale, \
                          A, C )

// Each macro below expands to one FLASH_Queue_push() call: task function,
// control pointer, task label, enabled_gpu and enabled_hip flags, then the
// four counts (n_int_args, n_fla_args, n_input_args, n_output_args)
// followed by the arguments they describe. cntl is parenthesized before the
// (void *) cast so that a compound expression is cast as a whole.

// QR_UT, macro variant: A and T are both outputs.
#define ENQUEUE_FLASH_QR_UT_macro( A, T, cntl ) \
        FLASH_Queue_push( (void *) FLA_QR_UT_macro_task, \
                          (void *) ( cntl ), \
                          "QR   ", \
                          FALSE, \
                          FALSE, \
                          0, 0, 0, 2, \
                          A, T )

// QR_UT: T travels as an FLA_Obj argument, A is the output.
#define ENQUEUE_FLASH_QR_UT( A, T, cntl ) \
        FLASH_Queue_push( (void *) FLA_QR_UT_task, \
                          (void *) ( cntl ), \
                          "QR   ", \
                          FALSE, \
                          FALSE, \
                          0, 1, 0, 1, \
                          T, A )

// As ENQUEUE_FLASH_QR_UT, with U as a second output.
#define ENQUEUE_FLASH_QR_UT_copy( A, T, U, cntl ) \
        FLASH_Queue_push( (void *) FLA_QR_UT_copy_task, \
                          (void *) ( cntl ), \
                          "QR   ", \
                          FALSE, \
                          FALSE, \
                          0, 1, 0, 2, \
                          T, A, U )

// QR2_UT: T is an FLA_Obj argument; D and B are outputs.
#define ENQUEUE_FLASH_QR2_UT( B, D, T, cntl ) \
        FLASH_Queue_push( (void *) FLA_QR2_UT_task, \
                          (void *) ( cntl ), \
                          "QR2  ", \
                          FALSE, \
                          FALSE, \
                          0, 1, 0, 2, \
                          T, D, B )

// LQ_UT, macro variant: A and T are both outputs.
#define ENQUEUE_FLASH_LQ_UT_macro( A, T, cntl ) \
        FLASH_Queue_push( (void *) FLA_LQ_UT_macro_task, \
                          (void *) ( cntl ), \
                          "LQ   ", \
                          FALSE, \
                          FALSE, \
                          0, 0, 0, 2, \
                          A, T )

// CAQR2_UT: T is an FLA_Obj argument; D and B are outputs.
#define ENQUEUE_FLASH_CAQR2_UT( B, D, T, cntl ) \
        FLASH_Queue_push( (void *) FLA_CAQR2_UT_task, \
                          (void *) ( cntl ), \
                          "CAQR2", \
                          FALSE, \
                          FALSE, \
                          0, 1, 0, 2, \
                          T, D, B )

// Each macro below expands to one FLASH_Queue_push() call: task function,
// control pointer, task label, enabled_gpu and enabled_hip flags, then the
// four counts (n_int_args, n_fla_args, n_input_args, n_output_args)
// followed by the arguments they describe. cntl is parenthesized before the
// (void *) cast so that a compound expression is cast as a whole.

// Apply_Q_UT: four integer options; T is an FLA_Obj argument; A is the
// input; B and W are outputs.
#define ENQUEUE_FLASH_Apply_Q_UT( side, trans, direct, storev, A, T, W, B, cntl ) \
        FLASH_Queue_push( (void *) FLA_Apply_Q_UT_task, \
                          (void *) ( cntl ), \
                          "ApQ  ", \
                          FALSE, \
                          FALSE, \
                          4, 1, 1, 2, \
                          side, trans, direct, storev, \
                          T, A, B, W )

// Apply_Q2_UT: four integer options; T is an FLA_Obj argument; D is the
// input; E, C and W are outputs.
#define ENQUEUE_FLASH_Apply_Q2_UT( side, trans, direct, storev, D, T, W, C, E, cntl ) \
        FLASH_Queue_push( (void *) FLA_Apply_Q2_UT_task, \
                          (void *) ( cntl ), \
                          "ApQ2 ", \
                          FALSE, \
                          FALSE, \
                          4, 1, 1, 3, \
                          side, trans, direct, storev, \
                          T, D, E, C, W )

// Apply_CAQ2_UT: same argument layout as ENQUEUE_FLASH_Apply_Q2_UT.
#define ENQUEUE_FLASH_Apply_CAQ2_UT( side, trans, direct, storev, D, T, W, C, E, cntl ) \
        FLASH_Queue_push( (void *) FLA_Apply_CAQ2_UT_task, \
                          (void *) ( cntl ), \
                          "ApCQ2", \
                          FALSE, \
                          FALSE, \
                          4, 1, 1, 3, \
                          side, trans, direct, storev, \
                          T, D, E, C, W )

// UDdate_UT: R, C, D and T are all outputs.
#define ENQUEUE_FLASH_UDdate_UT( R, C, D, T, cntl ) \
        FLASH_Queue_push( (void *) FLA_UDdate_UT_task, \
                          (void *) ( cntl ), \
                          "UD   ", \
                          FALSE, \
                          FALSE, \
                          0, 0, 0, 4, \
                          R, C, D, T )

// Apply_QUD_UT: four integer options; T, U and V are inputs; W, R, C and D
// are outputs.
#define ENQUEUE_FLASH_Apply_QUD_UT( side, trans, direct, storev, T, W, R, U, C, V, D, cntl ) \
        FLASH_Queue_push( (void *) FLA_Apply_QUD_UT_task, \
                          (void *) ( cntl ), \
                          "ApQUD", \
                          FALSE, \
                          FALSE, \
                          4, 0, 3, 4, \
                          side, trans, direct, storev, \
                          T, U, V, W, R, C, D )

// Eig_gest: inv and uplo are integer arguments; B is the input; Y and A are
// outputs.
#define ENQUEUE_FLASH_Eig_gest( inv, uplo, A, Y, B, cntl ) \
        FLASH_Queue_push( (void *) FLA_Eig_gest_task, \
                          (void *) ( cntl ), \
                          "Eig  ", \
                          FALSE, \
                          TRUE, \
                          2, 0, 1, 2, \
                          inv, uplo, \
                          B, Y, A )

// Level-3 BLAS

// Each macro below expands to one FLASH_Queue_push() call: task function,
// control pointer, task label, enabled_gpu and enabled_hip flags (both TRUE
// for every level-3 task here), then the four counts (n_int_args,
// n_fla_args, n_input_args, n_output_args) followed by the arguments they
// describe: integer options first, then the alpha/beta FLA_Obj scalars,
// then input operands, then the output operand.
// cntl is parenthesized before the (void *) cast so that a compound
// expression is cast as a whole.

#define ENQUEUE_FLASH_Gemm( transA, transB, alpha, A, B, beta, C, cntl ) \
        FLASH_Queue_push( (void *) FLA_Gemm_task, \
                          (void *) ( cntl ), \
                          "Gemm ", \
                          TRUE, \
                          TRUE, \
                          2, 2, 2, 1, \
                          transA, transB, \
                          alpha, beta, \
                          A, B, C )

#define ENQUEUE_FLASH_Hemm( side, uplo, alpha, A, B, beta, C, cntl ) \
        FLASH_Queue_push( (void *) FLA_Hemm_task, \
                          (void *) ( cntl ), \
                          "Hemm ", \
                          TRUE, \
                          TRUE, \
                          2, 2, 2, 1, \
                          side, uplo, \
                          alpha, beta, \
                          A, B, C )

#define ENQUEUE_FLASH_Herk( uplo, transA, alpha, A, beta, C, cntl ) \
        FLASH_Queue_push( (void *) FLA_Herk_task, \
                          (void *) ( cntl ), \
                          "Herk ", \
                          TRUE, \
                          TRUE, \
                          2, 2, 1, 1, \
                          uplo, transA, \
                          alpha, beta, \
                          A, C )

#define ENQUEUE_FLASH_Her2k( uplo, transA, alpha, A, B, beta, C, cntl ) \
        FLASH_Queue_push( (void *) FLA_Her2k_task, \
                          (void *) ( cntl ), \
                          "Her2k", \
                          TRUE, \
                          TRUE, \
                          2, 2, 2, 1, \
                          uplo, transA, \
                          alpha, beta, \
                          A, B, C )

#define ENQUEUE_FLASH_Symm( side, uplo, alpha, A, B, beta, C, cntl ) \
        FLASH_Queue_push( (void *) FLA_Symm_task, \
                          (void *) ( cntl ), \
                          "Symm ", \
                          TRUE, \
                          TRUE, \
                          2, 2, 2, 1, \
                          side, uplo, \
                          alpha, beta, \
                          A, B, C )

#define ENQUEUE_FLASH_Syrk( uplo, transA, alpha, A, beta, C, cntl ) \
        FLASH_Queue_push( (void *) FLA_Syrk_task, \
                          (void *) ( cntl ), \
                          "Syrk ", \
                          TRUE, \
                          TRUE, \
                          2, 2, 1, 1, \
                          uplo, transA, \
                          alpha, beta, \
                          A, C )

#define ENQUEUE_FLASH_Syr2k( uplo, transA, alpha, A, B, beta, C, cntl ) \
        FLASH_Queue_push( (void *) FLA_Syr2k_task, \
                          (void *) ( cntl ), \
                          "Syr2k", \
                          TRUE, \
                          TRUE, \
                          2, 2, 2, 1, \
                          uplo, transA, \
                          alpha, beta, \
                          A, B, C )

#define ENQUEUE_FLASH_Trmm( side, uplo, trans, diag, alpha, A, C, cntl ) \
        FLASH_Queue_push( (void *) FLA_Trmm_task, \
                          (void *) ( cntl ), \
                          "Trmm ", \
                          TRUE, \
                          TRUE, \
                          4, 1, 1, 1, \
                          side, uplo, trans, diag, \
                          alpha, \
                          A, C )

#define ENQUEUE_FLASH_Trsm( side, uplo, trans, diag, alpha, A, C, cntl ) \
        FLASH_Queue_push( (void *) FLA_Trsm_task, \
                          (void *) ( cntl ), \
                          "Trsm ", \
                          TRUE, \
                          TRUE, \
                          4, 1, 1, 1, \
                          side, uplo, trans, diag, \
                          alpha, \
                          A, C )

// Level-2 BLAS

// Each macro below expands to one FLASH_Queue_push() call: task function,
// control pointer, task label, enabled_gpu and enabled_hip flags, then the
// four counts (n_int_args, n_fla_args, n_input_args, n_output_args)
// followed by the arguments they describe. cntl is parenthesized before the
// (void *) cast so that a compound expression is cast as a whole.

// Gemv: trans is the integer argument; alpha and beta are FLA_Obj
// arguments; A and x are inputs; y is the output.
#define ENQUEUE_FLASH_Gemv( trans, alpha, A, x, beta, y, cntl ) \
        FLASH_Queue_push( (void *) FLA_Gemv_task, \
                          (void *) ( cntl ), \
                          "Gemv ", \
                          TRUE, \
                          TRUE, \
                          1, 2, 2, 1, \
                          trans, \
                          alpha, beta, \
                          A, x, y )

// Trsv: uplo, trans and diag are integer arguments; A is the input; x is
// the output.
#define ENQUEUE_FLASH_Trsv( uplo, trans, diag, A, x, cntl ) \
        FLASH_Queue_push( (void *) FLA_Trsv_task, \
                          (void *) ( cntl ), \
                          "Trsv ", \
                          TRUE, \
                          TRUE, \
                          3, 0, 1, 1, \
                          uplo, trans, diag, \
                          A, x )

// Level-1 BLAS

// Each macro below expands to one FLASH_Queue_push() call: task function,
// control pointer, task label, enabled_gpu and enabled_hip flags, then the
// four counts (n_int_args, n_fla_args, n_input_args, n_output_args)
// followed by the arguments they describe. cntl is parenthesized before the
// (void *) cast so that a compound expression is cast as a whole.

// Axpy: alpha is an FLA_Obj argument; A is the input; B is the output.
#define ENQUEUE_FLASH_Axpy( alpha, A, B, cntl ) \
        FLASH_Queue_push( (void *) FLA_Axpy_task, \
                          (void *) ( cntl ), \
                          "Axpy ", \
                          TRUE, \
                          TRUE, \
                          0, 1, 1, 1, \
                          alpha, \
                          A, B )

// Axpyt: as Axpy with a leading integer trans argument; both device flags
// are FALSE for this task.
#define ENQUEUE_FLASH_Axpyt( trans, alpha, A, B, cntl ) \
        FLASH_Queue_push( (void *) FLA_Axpyt_task, \
                          (void *) ( cntl ), \
                          "Axpyt", \
                          FALSE, \
                          FALSE, \
                          1, 1, 1, 1, \
                          trans, \
                          alpha, \
                          A, B )

// Copy: A is the input; B is the output.
#define ENQUEUE_FLASH_Copy( A, B, cntl ) \
        FLASH_Queue_push( (void *) FLA_Copy_task, \
                          (void *) ( cntl ), \
                          "Copy ", \
                          TRUE, \
                          TRUE, \
                          0, 0, 1, 1, \
                          A, B )

// Copyt: as Copy with a leading integer trans argument; both device flags
// are FALSE for this task.
#define ENQUEUE_FLASH_Copyt( trans, A, B, cntl ) \
        FLASH_Queue_push( (void *) FLA_Copyt_task, \
                          (void *) ( cntl ), \
                          "Copyt", \
                          FALSE, \
                          FALSE, \
                          1, 0, 1, 1, \
                          trans, \
                          A, B )

// Copyr: as Copy with a leading integer uplo argument; enabled_gpu is FALSE
// and enabled_hip is TRUE for this task.
#define ENQUEUE_FLASH_Copyr( uplo, A, B, cntl ) \
        FLASH_Queue_push( (void *) FLA_Copyr_task, \
                          (void *) ( cntl ), \
                          "Copyr", \
                          FALSE, \
                          TRUE, \
                          1, 0, 1, 1, \
                          uplo, \
                          A, B )

// Scal: alpha is an FLA_Obj argument; A is the output.
#define ENQUEUE_FLASH_Scal( alpha, A, cntl ) \
        FLASH_Queue_push( (void *) FLA_Scal_task, \
                          (void *) ( cntl ), \
                          "Scal ", \
                          TRUE, \
                          TRUE, \
                          0, 1, 0, 1, \
                          alpha, \
                          A )

// Scalr: as Scal with a leading integer uplo argument.
#define ENQUEUE_FLASH_Scalr( uplo, alpha, A, cntl ) \
        FLASH_Queue_push( (void *) FLA_Scalr_task, \
                          (void *) ( cntl ), \
                          "Scalr", \
                          TRUE, \
                          TRUE, \
                          1, 1, 0, 1, \
                          uplo, \
                          alpha, \
                          A )

// Base

// Each macro below expands to one FLASH_Queue_push() call: task function,
// control pointer, task label, enabled_gpu and enabled_hip flags, then the
// four counts (n_int_args, n_fla_args, n_input_args, n_output_args)
// followed by the arguments they describe. cntl is parenthesized before the
// (void *) cast so that a compound expression is cast as a whole.

// Buffer creation: rs and cs are integer arguments; A is the output.
#define ENQUEUE_FLASH_Obj_create_buffer( rs, cs, A, cntl ) \
        FLASH_Queue_push( (void *) FLA_Obj_create_buffer_task, \
                          (void *) ( cntl ), \
                          "Buff ", \
                          FALSE, \
                          FALSE, \
                          2, 0, 0, 1, \
                          rs, cs, \
                          A )

// Buffer release: A is the single output.
#define ENQUEUE_FLASH_Obj_free_buffer( A, cntl ) \
        FLASH_Queue_push( (void *) FLA_Obj_free_buffer_task, \
                          (void *) ( cntl ), \
                          "Free ", \
                          FALSE, \
                          FALSE, \
                          0, 0, 0, 1, \
                          A )

#else

// LAPACK-level

// Stubs for the branch closed by "#endif // FLA_ENABLE_SUPERMATRIX" below.
// Each macro keeps its parameter list but never references the parameters:
// every one expands to FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED ).
#define ENQUEUE_FLASH_LU_piv_macro( A, p, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_Apply_pivots_macro( side, trans, p, A, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_LU_piv( A, p, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_LU_piv_copy( A, p, U, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_Trsm_piv( A, C, p, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_SA_LU( U, D, p, L, nb_alg, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_SA_FS( L, D, p, C, E, nb_alg, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_LU_nopiv( A, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_Trinv( uplo, diag, A, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_Ttmm( uplo, A, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_Chol( uplo, A, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_Sylv( transA, transB, isgn, A, B, C, scale, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_Lyap( trans, isgn, A, C, scale, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_QR_UT_macro( A, T, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_QR_UT( A, T, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_QR_UT_copy( A, T, U, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_QR2_UT( B, D, T, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_LQ_UT_macro( A, T, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_CAQR2_UT( B, D, T, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_UDdate_UT( R, C, D, T, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_Apply_Q_UT( side, trans, direct, storev, A, T, W, B, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_Apply_Q2_UT( side, trans, direct, storev, D, T, W, C, E, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_Apply_CAQ2_UT( side, trans, direct, storev, D, T, W, C, E, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_Apply_QUD_UT( side, trans, direct, storev, T, W, R, U, C, V, D, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_Eig_gest( inv, uplo, A, Y, B, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )

// Level-3 BLAS

// Stubs: the parameters are accepted but never referenced; each macro expands
// to FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED ).
#define ENQUEUE_FLASH_Gemm( transA, transB, alpha, A, B, beta, C, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_Hemm( side, uplo, alpha, A, B, beta, C, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_Herk( uplo, transA, alpha, A, beta, C, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_Her2k( uplo, transA, alpha, A, B, beta, C, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_Symm( side, uplo, alpha, A, B, beta, C, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_Syrk( uplo, transA, alpha, A, beta, C, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_Syr2k( uplo, transA, alpha, A, B, beta, C, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_Trmm( side, uplo, trans, diag, alpha, A, C, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_Trsm( side, uplo, trans, diag, alpha, A, C, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )

// Level-2 BLAS

// Stubs: the parameters are accepted but never referenced; each macro expands
// to FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED ).
#define ENQUEUE_FLASH_Gemv( transA, alpha, A, x, beta, y, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_Trsv( uplo, trans, diag, A, x, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )

// Level-1 BLAS

// Stubs: the parameters are accepted but never referenced; each macro expands
// to FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED ).
#define ENQUEUE_FLASH_Axpy( alpha, A, B, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_Axpyt( trans, alpha, A, B, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_Copy( A, B, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_Copyt( trans, A, B, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_Copyr( uplo, A, B, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_Scal( alpha, A, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_Scalr( uplo, alpha, A, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )

// Base

// Stubs: the parameters are accepted but never referenced; each macro expands
// to FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED ).
#define ENQUEUE_FLASH_Obj_create_buffer( rs, cs, A, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )
#define ENQUEUE_FLASH_Obj_free_buffer( A, cntl ) FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED )

#endif // FLA_ENABLE_SUPERMATRIX


#endif // FLASH_QUEUE_MACRO_DEFS_H
// end FLASH_Queue_macro_defs.h

// begin FLASH_Queue_gpu.h


#ifndef FLASH_QUEUE_GPU_H
#define FLASH_QUEUE_GPU_H

// GPU-side FLASH queue declarations. They are visible only when
// FLA_ENABLE_GPU is defined; the definitions are not part of this header.
#ifdef FLA_ENABLE_GPU

void FLASH_Queue_init_gpu( void );
void FLASH_Queue_finalize_gpu( void );

FLA_Error FLASH_Queue_enable_gpu( void );
FLA_Error FLASH_Queue_disable_gpu( void );
FLA_Bool FLASH_Queue_get_enabled_gpu( void );

// --- helper functions -------------------------------------------------------

void FLASH_Queue_set_gpu_num_blocks( dim_t n_blocks );
dim_t FLASH_Queue_get_gpu_num_blocks( void );

FLA_Error FLASH_Queue_bind_gpu( int thread );
FLA_Error FLASH_Queue_alloc_gpu( dim_t size, FLA_Datatype datatype, void **buffer_gpu );
FLA_Error FLASH_Queue_free_gpu( void *buffer_gpu );
FLA_Error FLASH_Queue_write_gpu( FLA_Obj obj, void *buffer_gpu );
FLA_Error FLASH_Queue_read_gpu( FLA_Obj obj, void *buffer_gpu );

void FLASH_Queue_exec_task_gpu( FLASH_Task *t, void **input_arg, void **output_arg );

#endif

#endif // FLASH_QUEUE_GPU_H
// end FLASH_Queue_gpu.h
// begin FLASH_Queue_hip.h


#ifndef FLASH_QUEUE_HIP_H
#define FLASH_QUEUE_HIP_H

// HIP-side FLASH queue declarations. They are visible only when
// FLA_ENABLE_HIP is defined; the definitions are not part of this header.
#ifdef FLA_ENABLE_HIP


void           FLASH_Queue_init_hip( void );
void           FLASH_Queue_finalize_hip( void );

FLA_Error      FLASH_Queue_enable_hip( void );
FLA_Error      FLASH_Queue_disable_hip( void );
FLA_Bool       FLASH_Queue_get_enabled_hip( void );


// --- helper functions -------------------------------------------------------

FLA_Error      FLASH_Queue_available_devices_hip( int* device_count );

FLA_Error      FLASH_Queue_enable_malloc_managed_hip( void );
FLA_Error      FLASH_Queue_disable_malloc_managed_hip( void );
FLA_Bool       FLASH_Queue_get_malloc_managed_enabled_hip( void );

void           FLASH_Queue_set_hip_num_blocks( dim_t n_blocks );
dim_t          FLASH_Queue_get_hip_num_blocks( void );

FLA_Error      FLASH_Queue_bind_hip( int thread );
FLA_Error      FLASH_Queue_alloc_async_hip( int thread, dim_t size, FLA_Datatype datatype, void** buffer_hip );
FLA_Error      FLASH_Queue_free_async_hip( int thread, void* buffer_hip );
FLA_Error      FLASH_Queue_write_async_hip( int thread, FLA_Obj obj, void* buffer_hip );
FLA_Error      FLASH_Queue_read_hip( int thread, FLA_Obj obj, void* buffer_hip );
FLA_Error      FLASH_Queue_read_async_hip( int thread, FLA_Obj obj, void* buffer_hip );
FLA_Error      FLASH_Queue_sync_stream_hip( int thread );
FLA_Error      FLASH_Queue_sync_device_hip( int device );
// Explicit ( void ), like the other zero-argument declarations in this header:
// in C before C23 an empty "( )" declares a function with unspecified
// parameters, so calls with stray arguments would not be diagnosed.
FLA_Error      FLASH_Queue_sync_hip( void );

void           FLASH_Queue_exec_task_hip( FLASH_Task* t, void** input_arg, void** output_arg );


#endif

#endif // FLASH_QUEUE_HIP_H
// end FLASH_Queue_hip.h


#endif // FLASH_QUEUE_H
// end FLASH_Queue.h

  // Include Fortran name-mangling macro (if not already defined).
// begin FLA_f77_name_mangling.h


// --- Define Fortran name-mangling macro --------------------------------------

// If the F77_FUNC name-mangling macro is undefined, then we need to define
// it ourselves.
// Fallback definition of F77_FUNC( name_lower, name_upper ), used only when
// nothing earlier has defined it. Two switches select the spelling:
//   FLA_ENABLE_WINDOWS_BUILD  - defined: no trailing underscore is appended;
//                               undefined: a single "_" is pasted on.
//   FLA_ENABLE_UPPERCASE_BLAS - defined: the uppercase name is used;
//                               undefined: the lowercase name is used.
#ifndef F77_FUNC

  #if defined( FLA_ENABLE_WINDOWS_BUILD ) && defined( FLA_ENABLE_UPPERCASE_BLAS )

    // Windows build, uppercase routine names (no underscore).
    #define F77_FUNC( name_lower, name_upper ) name_upper

  #elif defined( FLA_ENABLE_WINDOWS_BUILD )

    // Windows build, lowercase routine names (no underscore).
    #define F77_FUNC( name_lower, name_upper ) name_lower

  #elif defined( FLA_ENABLE_UPPERCASE_BLAS )

    // Non-Windows build, uppercase routine names (single underscore).
    #define F77_FUNC( name_lower, name_upper ) name_upper ## _

  #else

    // Non-Windows build, lowercase routine names (single underscore).
    #define F77_FUNC( name_lower, name_upper ) name_lower ## _

  #endif

#endif // #ifndef F77_FUNC

// end FLA_f77_name_mangling.h

  // Include prototypes for LAPACK routines.
// begin FLA_lapack_f77_prototypes.h


// --- Name-mangling macro definitions -----------------------------------------

// --- Define Fortran name-mangling macro --------------------------

// Each F77_<routine> alias expands through F77_FUNC, which receives the
// lowercase and uppercase spellings of the routine name and picks one.

// ?potrf / ?potf2
#define F77_spotrf F77_FUNC(spotrf, SPOTRF)
#define F77_dpotrf F77_FUNC(dpotrf, DPOTRF)
#define F77_cpotrf F77_FUNC(cpotrf, CPOTRF)
#define F77_zpotrf F77_FUNC(zpotrf, ZPOTRF)
#define F77_spotf2 F77_FUNC(spotf2, SPOTF2)
#define F77_dpotf2 F77_FUNC(dpotf2, DPOTF2)
#define F77_cpotf2 F77_FUNC(cpotf2, CPOTF2)
#define F77_zpotf2 F77_FUNC(zpotf2, ZPOTF2)

// ?getrf / ?getf2
#define F77_sgetrf F77_FUNC(sgetrf, SGETRF)
#define F77_dgetrf F77_FUNC(dgetrf, DGETRF)
#define F77_cgetrf F77_FUNC(cgetrf, CGETRF)
#define F77_zgetrf F77_FUNC(zgetrf, ZGETRF)
#define F77_sgetf2 F77_FUNC(sgetf2, SGETF2)
#define F77_dgetf2 F77_FUNC(dgetf2, DGETF2)
#define F77_cgetf2 F77_FUNC(cgetf2, CGETF2)
#define F77_zgetf2 F77_FUNC(zgetf2, ZGETF2)

// ?geqrf
#define F77_sgeqrf F77_FUNC(sgeqrf, SGEQRF)
#define F77_dgeqrf F77_FUNC(dgeqrf, DGEQRF)
#define F77_cgeqrf F77_FUNC(cgeqrf, CGEQRF)
#define F77_zgeqrf F77_FUNC(zgeqrf, ZGEQRF)
      
// Name-mangled aliases for ?geqr2, ?geqrfp and ?geqr2p. Each alias expands
// through F77_FUNC with the lowercase and uppercase routine names, and each
// is defined exactly once so the mapping has a single place to be edited.
#define F77_sgeqr2 F77_FUNC( sgeqr2 , SGEQR2 )
#define F77_dgeqr2 F77_FUNC( dgeqr2 , DGEQR2 )
#define F77_cgeqr2 F77_FUNC( cgeqr2 , CGEQR2 )
#define F77_zgeqr2 F77_FUNC( zgeqr2 , ZGEQR2 )

#define F77_sgeqrfp F77_FUNC( sgeqrfp , SGEQRFP )
#define F77_dgeqrfp F77_FUNC( dgeqrfp , DGEQRFP )
#define F77_cgeqrfp F77_FUNC( cgeqrfp , CGEQRFP )
#define F77_zgeqrfp F77_FUNC( zgeqrfp , ZGEQRFP )

#define F77_sgeqr2p F77_FUNC( sgeqr2p , SGEQR2P )
#define F77_dgeqr2p F77_FUNC( dgeqr2p , DGEQR2P )
#define F77_cgeqr2p F77_FUNC( cgeqr2p , CGEQR2P )
#define F77_zgeqr2p F77_FUNC( zgeqr2p , ZGEQR2P )

// Each F77_<routine> alias expands through F77_FUNC, which receives the
// lowercase and uppercase spellings of the routine name and picks one.

// ?geqpf / ?geqp3
#define F77_sgeqpf F77_FUNC(sgeqpf, SGEQPF)
#define F77_dgeqpf F77_FUNC(dgeqpf, DGEQPF)
#define F77_cgeqpf F77_FUNC(cgeqpf, CGEQPF)
#define F77_zgeqpf F77_FUNC(zgeqpf, ZGEQPF)
#define F77_sgeqp3 F77_FUNC(sgeqp3, SGEQP3)
#define F77_dgeqp3 F77_FUNC(dgeqp3, DGEQP3)
#define F77_cgeqp3 F77_FUNC(cgeqp3, CGEQP3)
#define F77_zgeqp3 F77_FUNC(zgeqp3, ZGEQP3)

// ?gelsd / ?gelss
#define F77_sgelsd F77_FUNC(sgelsd, SGELSD)
#define F77_dgelsd F77_FUNC(dgelsd, DGELSD)
#define F77_cgelsd F77_FUNC(cgelsd, CGELSD)
#define F77_zgelsd F77_FUNC(zgelsd, ZGELSD)
#define F77_sgelss F77_FUNC(sgelss, SGELSS)
#define F77_dgelss F77_FUNC(dgelss, DGELSS)
#define F77_cgelss F77_FUNC(cgelss, CGELSS)
#define F77_zgelss F77_FUNC(zgelss, ZGELSS)

// ?gelqf / ?gelq2
#define F77_sgelqf F77_FUNC(sgelqf, SGELQF)
#define F77_dgelqf F77_FUNC(dgelqf, DGELQF)
#define F77_cgelqf F77_FUNC(cgelqf, CGELQF)
#define F77_zgelqf F77_FUNC(zgelqf, ZGELQF)
#define F77_sgelq2 F77_FUNC(sgelq2, SGELQ2)
#define F77_dgelq2 F77_FUNC(dgelq2, DGELQ2)
#define F77_cgelq2 F77_FUNC(cgelq2, CGELQ2)
#define F77_zgelq2 F77_FUNC(zgelq2, ZGELQ2)
      
// Each F77_<routine> alias expands through F77_FUNC, which receives the
// lowercase and uppercase spellings of the routine name and picks one.

// ?lauum / ?lauu2
#define F77_slauum F77_FUNC(slauum, SLAUUM)
#define F77_dlauum F77_FUNC(dlauum, DLAUUM)
#define F77_clauum F77_FUNC(clauum, CLAUUM)
#define F77_zlauum F77_FUNC(zlauum, ZLAUUM)
#define F77_slauu2 F77_FUNC(slauu2, SLAUU2)
#define F77_dlauu2 F77_FUNC(dlauu2, DLAUU2)
#define F77_clauu2 F77_FUNC(clauu2, CLAUU2)
#define F77_zlauu2 F77_FUNC(zlauu2, ZLAUU2)

// ?potri
#define F77_spotri F77_FUNC(spotri, SPOTRI)
#define F77_dpotri F77_FUNC(dpotri, DPOTRI)
#define F77_cpotri F77_FUNC(cpotri, CPOTRI)
#define F77_zpotri F77_FUNC(zpotri, ZPOTRI)

// ?trtri / ?trti2
#define F77_strtri F77_FUNC(strtri, STRTRI)
#define F77_dtrtri F77_FUNC(dtrtri, DTRTRI)
#define F77_ctrtri F77_FUNC(ctrtri, CTRTRI)
#define F77_ztrtri F77_FUNC(ztrtri, ZTRTRI)
#define F77_strti2 F77_FUNC(strti2, STRTI2)
#define F77_dtrti2 F77_FUNC(dtrti2, DTRTI2)
#define F77_ctrti2 F77_FUNC(ctrti2, CTRTI2)
#define F77_ztrti2 F77_FUNC(ztrti2, ZTRTI2)

// ?trsyl
#define F77_strsyl F77_FUNC(strsyl, STRSYL)
#define F77_dtrsyl F77_FUNC(dtrsyl, DTRSYL)
#define F77_ctrsyl F77_FUNC(ctrsyl, CTRSYL)
#define F77_ztrsyl F77_FUNC(ztrsyl, ZTRSYL)
      
      
// Each F77_<routine> alias expands through F77_FUNC, which receives the
// lowercase and uppercase spellings of the routine name and picks one.
// Where the real variants are named ?sy*, the complex variants are ?he*.

// ?gehrd / ?gehd2
#define F77_sgehrd F77_FUNC(sgehrd, SGEHRD)
#define F77_dgehrd F77_FUNC(dgehrd, DGEHRD)
#define F77_cgehrd F77_FUNC(cgehrd, CGEHRD)
#define F77_zgehrd F77_FUNC(zgehrd, ZGEHRD)
#define F77_sgehd2 F77_FUNC(sgehd2, SGEHD2)
#define F77_dgehd2 F77_FUNC(dgehd2, DGEHD2)
#define F77_cgehd2 F77_FUNC(cgehd2, CGEHD2)
#define F77_zgehd2 F77_FUNC(zgehd2, ZGEHD2)

// ?sytrd, ?hetrd / ?sytd2, ?hetd2
#define F77_ssytrd F77_FUNC(ssytrd, SSYTRD)
#define F77_dsytrd F77_FUNC(dsytrd, DSYTRD)
#define F77_chetrd F77_FUNC(chetrd, CHETRD)
#define F77_zhetrd F77_FUNC(zhetrd, ZHETRD)
#define F77_ssytd2 F77_FUNC(ssytd2, SSYTD2)
#define F77_dsytd2 F77_FUNC(dsytd2, DSYTD2)
#define F77_chetd2 F77_FUNC(chetd2, CHETD2)
#define F77_zhetd2 F77_FUNC(zhetd2, ZHETD2)

// ?gebrd / ?gebd2
#define F77_sgebrd F77_FUNC(sgebrd, SGEBRD)
#define F77_dgebrd F77_FUNC(dgebrd, DGEBRD)
#define F77_cgebrd F77_FUNC(cgebrd, CGEBRD)
#define F77_zgebrd F77_FUNC(zgebrd, ZGEBRD)
#define F77_sgebd2 F77_FUNC(sgebd2, SGEBD2)
#define F77_dgebd2 F77_FUNC(dgebd2, DGEBD2)
#define F77_cgebd2 F77_FUNC(cgebd2, CGEBD2)
#define F77_zgebd2 F77_FUNC(zgebd2, ZGEBD2)

// ?sygst, ?hegst / ?sygs2, ?hegs2
#define F77_ssygst F77_FUNC(ssygst, SSYGST)
#define F77_dsygst F77_FUNC(dsygst, DSYGST)
#define F77_chegst F77_FUNC(chegst, CHEGST)
#define F77_zhegst F77_FUNC(zhegst, ZHEGST)
#define F77_ssygs2 F77_FUNC(ssygs2, SSYGS2)
#define F77_dsygs2 F77_FUNC(dsygs2, DSYGS2)
#define F77_chegs2 F77_FUNC(chegs2, CHEGS2)
#define F77_zhegs2 F77_FUNC(zhegs2, ZHEGS2)
      
      
// Each F77_<routine> alias expands through F77_FUNC, which receives the
// lowercase and uppercase spellings of the routine name and picks one.

// ?larft
#define F77_slarft F77_FUNC(slarft, SLARFT)
#define F77_dlarft F77_FUNC(dlarft, DLARFT)
#define F77_clarft F77_FUNC(clarft, CLARFT)
#define F77_zlarft F77_FUNC(zlarft, ZLARFT)

// ?larfg / ?larfgp
#define F77_slarfg F77_FUNC(slarfg, SLARFG)
#define F77_dlarfg F77_FUNC(dlarfg, DLARFG)
#define F77_clarfg F77_FUNC(clarfg, CLARFG)
#define F77_zlarfg F77_FUNC(zlarfg, ZLARFG)
#define F77_slarfgp F77_FUNC(slarfgp, SLARFGP)
#define F77_dlarfgp F77_FUNC(dlarfgp, DLARFGP)
#define F77_clarfgp F77_FUNC(clarfgp, CLARFGP)
#define F77_zlarfgp F77_FUNC(zlarfgp, ZLARFGP)
      
// Each F77_<routine> alias expands through F77_FUNC, which receives the
// lowercase and uppercase spellings of the routine name and picks one.
// In every group the real variants are named ?or* and the complex ones ?un*.

// ?orgqr, ?ungqr / ?org2r, ?ung2r
#define F77_sorgqr F77_FUNC(sorgqr, SORGQR)
#define F77_dorgqr F77_FUNC(dorgqr, DORGQR)
#define F77_cungqr F77_FUNC(cungqr, CUNGQR)
#define F77_zungqr F77_FUNC(zungqr, ZUNGQR)
#define F77_sorg2r F77_FUNC(sorg2r, SORG2R)
#define F77_dorg2r F77_FUNC(dorg2r, DORG2R)
#define F77_cung2r F77_FUNC(cung2r, CUNG2R)
#define F77_zung2r F77_FUNC(zung2r, ZUNG2R)

// ?ormqr, ?unmqr / ?orm2r, ?unm2r
#define F77_sormqr F77_FUNC(sormqr, SORMQR)
#define F77_dormqr F77_FUNC(dormqr, DORMQR)
#define F77_cunmqr F77_FUNC(cunmqr, CUNMQR)
#define F77_zunmqr F77_FUNC(zunmqr, ZUNMQR)
#define F77_sorm2r F77_FUNC(sorm2r, SORM2R)
#define F77_dorm2r F77_FUNC(dorm2r, DORM2R)
#define F77_cunm2r F77_FUNC(cunm2r, CUNM2R)
#define F77_zunm2r F77_FUNC(zunm2r, ZUNM2R)

// ?ormlq, ?unmlq / ?orml2, ?unml2
#define F77_sormlq F77_FUNC(sormlq, SORMLQ)
#define F77_dormlq F77_FUNC(dormlq, DORMLQ)
#define F77_cunmlq F77_FUNC(cunmlq, CUNMLQ)
#define F77_zunmlq F77_FUNC(zunmlq, ZUNMLQ)
#define F77_sorml2 F77_FUNC(sorml2, SORML2)
#define F77_dorml2 F77_FUNC(dorml2, DORML2)
#define F77_cunml2 F77_FUNC(cunml2, CUNML2)
#define F77_zunml2 F77_FUNC(zunml2, ZUNML2)

// ?orglq, ?unglq / ?orgl2, ?ungl2
#define F77_sorglq F77_FUNC(sorglq, SORGLQ)
#define F77_dorglq F77_FUNC(dorglq, DORGLQ)
#define F77_cunglq F77_FUNC(cunglq, CUNGLQ)
#define F77_zunglq F77_FUNC(zunglq, ZUNGLQ)
#define F77_sorgl2 F77_FUNC(sorgl2, SORGL2)
#define F77_dorgl2 F77_FUNC(dorgl2, DORGL2)
#define F77_cungl2 F77_FUNC(cungl2, CUNGL2)
#define F77_zungl2 F77_FUNC(zungl2, ZUNGL2)

// ?orgtr, ?ungtr / ?ormtr, ?unmtr
#define F77_sorgtr F77_FUNC(sorgtr, SORGTR)
#define F77_dorgtr F77_FUNC(dorgtr, DORGTR)
#define F77_cungtr F77_FUNC(cungtr, CUNGTR)
#define F77_zungtr F77_FUNC(zungtr, ZUNGTR)
#define F77_sormtr F77_FUNC(sormtr, SORMTR)
#define F77_dormtr F77_FUNC(dormtr, DORMTR)
#define F77_cunmtr F77_FUNC(cunmtr, CUNMTR)
#define F77_zunmtr F77_FUNC(zunmtr, ZUNMTR)

// ?orgbr, ?ungbr / ?ormbr, ?unmbr
#define F77_sorgbr F77_FUNC(sorgbr, SORGBR)
#define F77_dorgbr F77_FUNC(dorgbr, DORGBR)
#define F77_cungbr F77_FUNC(cungbr, CUNGBR)
#define F77_zungbr F77_FUNC(zungbr, ZUNGBR)
#define F77_sormbr F77_FUNC(sormbr, SORMBR)
#define F77_dormbr F77_FUNC(dormbr, DORMBR)
#define F77_cunmbr F77_FUNC(cunmbr, CUNMBR)
#define F77_zunmbr F77_FUNC(zunmbr, ZUNMBR)
      
      
// Each F77_<routine> alias expands through F77_FUNC, which receives the
// lowercase and uppercase spellings of the routine name and picks one.
// Where the real variants are named ?sy*, the complex variants are ?he*.

// ?steqr / ?stedc / ?stemr
#define F77_ssteqr F77_FUNC(ssteqr, SSTEQR)
#define F77_dsteqr F77_FUNC(dsteqr, DSTEQR)
#define F77_csteqr F77_FUNC(csteqr, CSTEQR)
#define F77_zsteqr F77_FUNC(zsteqr, ZSTEQR)
#define F77_sstedc F77_FUNC(sstedc, SSTEDC)
#define F77_dstedc F77_FUNC(dstedc, DSTEDC)
#define F77_cstedc F77_FUNC(cstedc, CSTEDC)
#define F77_zstedc F77_FUNC(zstedc, ZSTEDC)
#define F77_sstemr F77_FUNC(sstemr, SSTEMR)
#define F77_dstemr F77_FUNC(dstemr, DSTEMR)
#define F77_cstemr F77_FUNC(cstemr, CSTEMR)
#define F77_zstemr F77_FUNC(zstemr, ZSTEMR)

// ?syev, ?heev / ?syevd, ?heevd / ?syevr, ?heevr
#define F77_ssyev F77_FUNC(ssyev, SSYEV)
#define F77_dsyev F77_FUNC(dsyev, DSYEV)
#define F77_cheev F77_FUNC(cheev, CHEEV)
#define F77_zheev F77_FUNC(zheev, ZHEEV)
#define F77_ssyevd F77_FUNC(ssyevd, SSYEVD)
#define F77_dsyevd F77_FUNC(dsyevd, DSYEVD)
#define F77_cheevd F77_FUNC(cheevd, CHEEVD)
#define F77_zheevd F77_FUNC(zheevd, ZHEEVD)
#define F77_ssyevr F77_FUNC(ssyevr, SSYEVR)
#define F77_dsyevr F77_FUNC(dsyevr, DSYEVR)
#define F77_cheevr F77_FUNC(cheevr, CHEEVR)
#define F77_zheevr F77_FUNC(zheevr, ZHEEVR)
      
      
// Each F77_<routine> alias expands through F77_FUNC, which receives the
// lowercase and uppercase spellings of the routine name and picks one.

// ?bdsqr / ?bdsdc (only the s and d variants of bdsdc are mapped here)
#define F77_sbdsqr F77_FUNC(sbdsqr, SBDSQR)
#define F77_dbdsqr F77_FUNC(dbdsqr, DBDSQR)
#define F77_cbdsqr F77_FUNC(cbdsqr, CBDSQR)
#define F77_zbdsqr F77_FUNC(zbdsqr, ZBDSQR)
#define F77_sbdsdc F77_FUNC(sbdsdc, SBDSDC)
#define F77_dbdsdc F77_FUNC(dbdsdc, DBDSDC)

// ?gesvd / ?gesdd
#define F77_sgesvd F77_FUNC(sgesvd, SGESVD)
#define F77_dgesvd F77_FUNC(dgesvd, DGESVD)
#define F77_cgesvd F77_FUNC(cgesvd, CGESVD)
#define F77_zgesvd F77_FUNC(zgesvd, ZGESVD)
#define F77_sgesdd F77_FUNC(sgesdd, SGESDD)
#define F77_dgesdd F77_FUNC(dgesdd, DGESDD)
#define F77_cgesdd F77_FUNC(cgesdd, CGESDD)
#define F77_zgesdd F77_FUNC(zgesdd, ZGESDD)

// ?laswp / ?laset
#define F77_slaswp F77_FUNC(slaswp, SLASWP)
#define F77_dlaswp F77_FUNC(dlaswp, DLASWP)
#define F77_claswp F77_FUNC(claswp, CLASWP)
#define F77_zlaswp F77_FUNC(zlaswp, ZLASWP)
#define F77_slaset F77_FUNC(slaset, SLASET)
#define F77_dlaset F77_FUNC(dlaset, DLASET)
#define F77_claset F77_FUNC(claset, CLASET)
#define F77_zlaset F77_FUNC(zlaset, ZLASET)
      

// --- Cholesky factorization ---

// Prototypes for the name-mangled ?potrf and ?potf2 symbols. Every argument
// is passed by pointer and every routine is declared to return int. The
// s/d/c/z variants differ only in the element type of `a` (float, double,
// scomplex, dcomplex).
int F77_spotrf( char* uplo, int* n, float*    a, int* lda, int* info );
int F77_dpotrf( char* uplo, int* n, double*   a, int* lda, int* info );
int F77_cpotrf( char* uplo, int* n, scomplex* a, int* lda, int* info );
int F77_zpotrf( char* uplo, int* n, dcomplex* a, int* lda, int* info );

int F77_spotf2( char* uplo, int* n, float*    a, int* lda, int* info );
int F77_dpotf2( char* uplo, int* n, double*   a, int* lda, int* info );
int F77_cpotf2( char* uplo, int* n, scomplex* a, int* lda, int* info );
int F77_zpotf2( char* uplo, int* n, dcomplex* a, int* lda, int* info );

// --- LU factorization with partial pivoting ---

// Prototypes for the name-mangled ?getrf and ?getf2 symbols. Every argument
// is passed by pointer; `ipiv` is an int array in all four precisions, and
// only the element type of `a` changes between the s/d/c/z variants.
int F77_sgetrf( int* m, int* n, float*    a, int* lda, int* ipiv, int* info );
int F77_dgetrf( int* m, int* n, double*   a, int* lda, int* ipiv, int* info );
int F77_cgetrf( int* m, int* n, scomplex* a, int* lda, int* ipiv, int* info );
int F77_zgetrf( int* m, int* n, dcomplex* a, int* lda, int* ipiv, int* info );

int F77_sgetf2( int* m, int* n, float*    a, int* lda, int* ipiv, int* info );
int F77_dgetf2( int* m, int* n, double*   a, int* lda, int* ipiv, int* info );
int F77_cgetf2( int* m, int* n, scomplex* a, int* lda, int* ipiv, int* info );
int F77_zgetf2( int* m, int* n, dcomplex* a, int* lda, int* ipiv, int* info );

// --- QR factorization (classic) ---

// Prototypes for the name-mangled ?geqrf, ?geqr2, ?geqpf and ?geqp3 symbols.
// Every argument is passed by pointer. Differences visible in the signatures:
//   - ?geqrf and ?geqp3 take an `lwork` argument; ?geqr2 and ?geqpf do not.
//   - ?geqpf and ?geqp3 take an int array `jpvt`.
//   - the complex variants of ?geqpf and ?geqp3 take an extra real-typed
//     `rwork` (float* for c, double* for z) that the real variants lack.
int F77_sgeqrf( int* m, int* n, float*    a, int* lda, float*    tau, float*    work, int* lwork, int* info );
int F77_dgeqrf( int* m, int* n, double*   a, int* lda, double*   tau, double*   work, int* lwork, int* info );
int F77_cgeqrf( int* m, int* n, scomplex* a, int* lda, scomplex* tau, scomplex* work, int* lwork, int* info );
int F77_zgeqrf( int* m, int* n, dcomplex* a, int* lda, dcomplex* tau, dcomplex* work, int* lwork, int* info );

int F77_sgeqr2( int* m, int* n, float*    a, int* lda, float*    tau, float*    work, int* info );
int F77_dgeqr2( int* m, int* n, double*   a, int* lda, double*   tau, double*   work, int* info );
int F77_cgeqr2( int* m, int* n, scomplex* a, int* lda, scomplex* tau, scomplex* work, int* info );
int F77_zgeqr2( int* m, int* n, dcomplex* a, int* lda, dcomplex* tau, dcomplex* work, int* info );

int F77_sgeqpf( int* m, int* n, float*    a, int* lda, int* jpvt, float*    tau, float*    work,                int* info );
int F77_dgeqpf( int* m, int* n, double*   a, int* lda, int* jpvt, double*   tau, double*   work,                int* info );
int F77_cgeqpf( int* m, int* n, scomplex* a, int* lda, int* jpvt, scomplex* tau, scomplex* work, float*  rwork, int* info );
int F77_zgeqpf( int* m, int* n, dcomplex* a, int* lda, int* jpvt, dcomplex* tau, dcomplex* work, double* rwork, int* info );

int F77_sgeqp3( int* m, int* n, float*    a, int* lda, int* jpvt, float*    tau, float*    work, int* lwork,                int* info );
int F77_dgeqp3( int* m, int* n, double*   a, int* lda, int* jpvt, double*   tau, double*   work, int* lwork,                int* info );
int F77_cgeqp3( int* m, int* n, scomplex* a, int* lda, int* jpvt, scomplex* tau, scomplex* work, int* lwork, float*  rwork, int* info );
int F77_zgeqp3( int* m, int* n, dcomplex* a, int* lda, int* jpvt, dcomplex* tau, dcomplex* work, int* lwork, double* rwork, int* info );

// --- LQ factorization (classic) ---

// Prototypes for the name-mangled ?gelqf and ?gelq2 symbols. Every argument
// is passed by pointer; ?gelqf takes an `lwork` argument that ?gelq2 lacks.
// `a`, `tau` and `work` share one element type per precision.
int F77_sgelqf( int* m, int* n, float*    a, int* lda, float*    tau, float*    work, int* lwork, int* info );
int F77_dgelqf( int* m, int* n, double*   a, int* lda, double*   tau, double*   work, int* lwork, int* info );
int F77_cgelqf( int* m, int* n, scomplex* a, int* lda, scomplex* tau, scomplex* work, int* lwork, int* info );
int F77_zgelqf( int* m, int* n, dcomplex* a, int* lda, dcomplex* tau, dcomplex* work, int* lwork, int* info );

int F77_sgelq2( int* m, int* n, float*    a, int* lda, float*    tau, float*    work, int* info );
int F77_dgelq2( int* m, int* n, double*   a, int* lda, double*   tau, double*   work, int* info );
int F77_cgelq2( int* m, int* n, scomplex* a, int* lda, scomplex* tau, scomplex* work, int* info );
int F77_zgelq2( int* m, int* n, dcomplex* a, int* lda, dcomplex* tau, dcomplex* work, int* info );

// --- LS solver ---

// Prototypes for the name-mangled ?gelsd and ?gelss symbols. Every argument
// is passed by pointer. `s` and `rcond` are real-typed even for the complex
// variants (float* for c, double* for z). The complex variants take an extra
// real-typed `rwork`; ?gelsd additionally takes an int array `iwork`.
int F77_sgelsd( int* m, int* n, int* nrhs, float*    a, int* lda, float*    b, int* ldb, float*  s, float*  rcond, int* rank, float*    work, int* lwork,                int* iwork, int* info );
int F77_dgelsd( int* m, int* n, int* nrhs, double*   a, int* lda, double*   b, int* ldb, double* s, double* rcond, int* rank, double*   work, int* lwork,                int* iwork, int* info );
int F77_cgelsd( int* m, int* n, int* nrhs, scomplex* a, int* lda, scomplex* b, int* ldb, float*  s, float*  rcond, int* rank, scomplex* work, int* lwork, float*  rwork, int* iwork, int* info );
int F77_zgelsd( int* m, int* n, int* nrhs, dcomplex* a, int* lda, dcomplex* b, int* ldb, double* s, double* rcond, int* rank, dcomplex* work, int* lwork, double* rwork, int* iwork, int* info );

int F77_sgelss( int* m, int* n, int* nrhs, float*    a, int* lda, float*    b, int* ldb, float*  s, float*  rcond, int* rank, float*    work, int* lwork,                int* info );
int F77_dgelss( int* m, int* n, int* nrhs, double*   a, int* lda, double*   b, int* ldb, double* s, double* rcond, int* rank, double*   work, int* lwork,                int* info );
int F77_cgelss( int* m, int* n, int* nrhs, scomplex* a, int* lda, scomplex* b, int* ldb, float*  s, float*  rcond, int* rank, scomplex* work, int* lwork, float*  rwork, int* info );
int F77_zgelss( int* m, int* n, int* nrhs, dcomplex* a, int* lda, dcomplex* b, int* ldb, double* s, double* rcond, int* rank, dcomplex* work, int* lwork, double* rwork, int* info );

// --- Triangular-transpose matrix multiply ---

// Prototypes for the name-mangled ?lauum and ?lauu2 symbols. Both families
// share one signature shape (uplo, n, a, lda, info), all passed by pointer;
// only the element type of `a` changes between the s/d/c/z variants.
int F77_slauum( char* uplo, int* n, float*    a, int* lda, int* info );
int F77_dlauum( char* uplo, int* n, double*   a, int* lda, int* info );
int F77_clauum( char* uplo, int* n, scomplex* a, int* lda, int* info );
int F77_zlauum( char* uplo, int* n, dcomplex* a, int* lda, int* info );

int F77_slauu2( char* uplo, int* n, float*    a, int* lda, int* info );
int F77_dlauu2( char* uplo, int* n, double*   a, int* lda, int* info );
int F77_clauu2( char* uplo, int* n, scomplex* a, int* lda, int* info );
int F77_zlauu2( char* uplo, int* n, dcomplex* a, int* lda, int* info );

// --- Symmetric (hermitian) positive definite matrix inversion ---

// Prototypes for the name-mangled ?potri symbols. Every argument is passed by
// pointer. The parameters are named buff_A / ldim_A here, where the
// neighbouring prototypes use a / lda.
int F77_spotri( char* uplo, int*  n, float*    buff_A, int*  ldim_A, int*  info );
int F77_dpotri( char* uplo, int*  n, double*   buff_A, int*  ldim_A, int*  info );
int F77_cpotri( char* uplo, int*  n, scomplex* buff_A, int*  ldim_A, int*  info );
int F77_zpotri( char* uplo, int*  n, dcomplex* buff_A, int*  ldim_A, int*  info );

// --- Triangular matrix inversion ---

// Prototypes for the name-mangled ?trtri and ?trti2 symbols. Both families
// share one signature shape (uplo, diag, n, a, lda, info), all passed by
// pointer; only the element type of `a` changes between the s/d/c/z variants.
int F77_strtri( char* uplo, char* diag, int* n, float*    a, int* lda, int* info );
int F77_dtrtri( char* uplo, char* diag, int* n, double*   a, int* lda, int* info );
int F77_ctrtri( char* uplo, char* diag, int* n, scomplex* a, int* lda, int* info );
int F77_ztrtri( char* uplo, char* diag, int* n, dcomplex* a, int* lda, int* info );

int F77_strti2( char* uplo, char* diag, int* n, float*    a, int* lda, int* info );
int F77_dtrti2( char* uplo, char* diag, int* n, double*   a, int* lda, int* info );
int F77_ctrti2( char* uplo, char* diag, int* n, scomplex* a, int* lda, int* info );
int F77_ztrti2( char* uplo, char* diag, int* n, dcomplex* a, int* lda, int* info );

// --- Triangular Sylvester equation solve ---

// Prototypes for the name-mangled ?trsyl symbols. Every argument is passed by
// pointer. `scale` is real-typed even for the complex variants (float* for c,
// double* for z), while `a`, `b` and `c` use the per-precision element type.
int F77_strsyl( char* transa, char* transb, int* isgn, int* m, int* n, float*    a, int* lda, float*    b, int* ldb, float*    c, int* ldc, float*    scale, int* info );
int F77_dtrsyl( char* transa, char* transb, int* isgn, int* m, int* n, double*   a, int* lda, double*   b, int* ldb, double*   c, int* ldc, double*   scale, int* info );
int F77_ctrsyl( char* transa, char* transb, int* isgn, int* m, int* n, scomplex* a, int* lda, scomplex* b, int* ldb, scomplex* c, int* ldc, float*    scale, int* info );
int F77_ztrsyl( char* transa, char* transb, int* isgn, int* m, int* n, dcomplex* a, int* lda, dcomplex* b, int* ldb, dcomplex* c, int* ldc, double*   scale, int* info );

// --- Reduction to upper Hessenberg form ---

// Prototypes for the name-mangled ?gehrd and ?gehd2 symbols. Every argument
// is passed by pointer; ?gehrd takes an `lwork` argument that ?gehd2 lacks.
// `a`, `tau` and `work` share one element type per precision.
int F77_sgehrd( int* n, int* ilo, int* ihi, float*    a, int* lda, float*    tau, float*    work, int* lwork, int* info );
int F77_dgehrd( int* n, int* ilo, int* ihi, double*   a, int* lda, double*   tau, double*   work, int* lwork, int* info );
int F77_cgehrd( int* n, int* ilo, int* ihi, scomplex* a, int* lda, scomplex* tau, scomplex* work, int* lwork, int* info );
int F77_zgehrd( int* n, int* ilo, int* ihi, dcomplex* a, int* lda, dcomplex* tau, dcomplex* work, int* lwork, int* info );

int F77_sgehd2( int* n, int* ilo, int* ihi, float*    a, int* lda, float*    tau, float*    work, int* info );
int F77_dgehd2( int* n, int* ilo, int* ihi, double*   a, int* lda, double*   tau, double*   work, int* info );
int F77_cgehd2( int* n, int* ilo, int* ihi, scomplex* a, int* lda, scomplex* tau, scomplex* work, int* info );
int F77_zgehd2( int* n, int* ilo, int* ihi, dcomplex* a, int* lda, dcomplex* tau, dcomplex* work, int* info );

// --- Reduction to tridiagonal form ---

// Prototypes for the name-mangled ?sytrd/?hetrd and ?sytd2/?hetd2 symbols
// (the complex variants use the "he" names). Every argument is passed by
// pointer. `d` and `e` are real-typed even for the complex variants (float*
// for c, double* for z). The ?trd routines take `work` and `lwork`; the ?td2
// routines take neither.
int F77_ssytrd( char* uplo, int* n, float*    a, int* lda, float*  d, float*  e, float*    tau, float*    work, int* lwork, int* info );
int F77_dsytrd( char* uplo, int* n, double*   a, int* lda, double* d, double* e, double*   tau, double*   work, int* lwork, int* info );
int F77_chetrd( char* uplo, int* n, scomplex* a, int* lda, float*  d, float*  e, scomplex* tau, scomplex* work, int* lwork, int* info );
int F77_zhetrd( char* uplo, int* n, dcomplex* a, int* lda, double* d, double* e, dcomplex* tau, dcomplex* work, int* lwork, int* info );

int F77_ssytd2( char* uplo, int* n, float*    a, int* lda, float*  d, float*  e, float*    tau, int* info );
int F77_dsytd2( char* uplo, int* n, double*   a, int* lda, double* d, double* e, double*   tau, int* info );
int F77_chetd2( char* uplo, int* n, scomplex* a, int* lda, float*  d, float*  e, scomplex* tau, int* info );
int F77_zhetd2( char* uplo, int* n, dcomplex* a, int* lda, double* d, double* e, dcomplex* tau, int* info );

// --- Reduction to bidiagonal form ---

// Prototypes for the name-mangled ?gebrd and ?gebd2 symbols. Every argument
// is passed by pointer. `d` and `e` are real-typed even for the complex
// variants (float* for c, double* for z); `tauq`, `taup` and `work` use the
// per-precision element type. ?gebrd takes an `lwork` argument that ?gebd2
// lacks.
int F77_sgebrd( int* m, int* n, float*    a, int* lda, float*  d, float*  e, float*    tauq, float*    taup, float*    work, int* lwork, int* info );
int F77_dgebrd( int* m, int* n, double*   a, int* lda, double* d, double* e, double*   tauq, double*   taup, double*   work, int* lwork, int* info );
int F77_cgebrd( int* m, int* n, scomplex* a, int* lda, float*  d, float*  e, scomplex* tauq, scomplex* taup, scomplex* work, int* lwork, int* info );
int F77_zgebrd( int* m, int* n, dcomplex* a, int* lda, double* d, double* e, dcomplex* tauq, dcomplex* taup, dcomplex* work, int* lwork, int* info );

int F77_sgebd2( int* m, int* n, float*    a, int* lda, float*  d, float*  e, float*    tauq, float*    taup, float*    work, int* info );
int F77_dgebd2( int* m, int* n, double*   a, int* lda, double* d, double* e, double*   tauq, double*   taup, double*   work, int* info );
int F77_cgebd2( int* m, int* n, scomplex* a, int* lda, float*  d, float*  e, scomplex* tauq, scomplex* taup, scomplex* work, int* info );
int F77_zgebd2( int* m, int* n, dcomplex* a, int* lda, double* d, double* e, dcomplex* tauq, dcomplex* taup, dcomplex* work, int* info );

// --- Reduce Hermitian-definite generalized eigenproblem to standard form ---

// F77_?sygst / F77_?hegst prototypes. Two matrices are passed (a with lda,
// b with ldb); neither takes a workspace argument.
int F77_ssygst( int* itype, char* uplo, int* n, float*    a, int* lda, float*    b, int* ldb, int* info );
int F77_dsygst( int* itype, char* uplo, int* n, double*   a, int* lda, double*   b, int* ldb, int* info );
int F77_chegst( int* itype, char* uplo, int* n, scomplex* a, int* lda, scomplex* b, int* ldb, int* info );
int F77_zhegst( int* itype, char* uplo, int* n, dcomplex* a, int* lda, dcomplex* b, int* ldb, int* info );

// F77_?sygs2 / F77_?hegs2 prototypes: argument lists are identical to the
// F77_?sygst / F77_?hegst group.
int F77_ssygs2( int* itype, char* uplo, int* n, float*    a, int* lda, float*    b, int* ldb, int* info );
int F77_dsygs2( int* itype, char* uplo, int* n, double*   a, int* lda, double*   b, int* ldb, int* info );
int F77_chegs2( int* itype, char* uplo, int* n, scomplex* a, int* lda, scomplex* b, int* ldb, int* info );
int F77_zhegs2( int* itype, char* uplo, int* n, dcomplex* a, int* lda, dcomplex* b, int* ldb, int* info );

// --- Accumulate block Householder matrix T (classic) ---

// F77_?larft prototypes. v, tau and t all share the variant's element type.
// Note there is no info argument in this family.
int F77_slarft( char* direct, char* storev, int* n, int* k, float*    v, int* ldv, float*    tau, float*    t, int* ldt );
int F77_dlarft( char* direct, char* storev, int* n, int* k, double*   v, int* ldv, double*   tau, double*   t, int* ldt );
int F77_clarft( char* direct, char* storev, int* n, int* k, scomplex* v, int* ldv, scomplex* tau, scomplex* t, int* ldt );
int F77_zlarft( char* direct, char* storev, int* n, int* k, dcomplex* v, int* ldv, dcomplex* tau, dcomplex* t, int* ldt );

// --- Generate a Householder vector (classic) ---

// F77_?larfg prototypes. alpha, x and tau share the variant's element type;
// x is paired with the integer incx. No info argument.
int F77_slarfg( int* n, float*    alpha, float*    x, int* incx, float*    tau );
int F77_dlarfg( int* n, double*   alpha, double*   x, int* incx, double*   tau );
int F77_clarfg( int* n, scomplex* alpha, scomplex* x, int* incx, scomplex* tau );
int F77_zlarfg( int* n, dcomplex* alpha, dcomplex* x, int* incx, dcomplex* tau );

// F77_?larfgp prototypes: argument lists are identical to F77_?larfg.
int F77_slarfgp( int* n, float*    alpha, float*    x, int* incx, float*    tau );
int F77_dlarfgp( int* n, double*   alpha, double*   x, int* incx, double*   tau );
int F77_clarfgp( int* n, scomplex* alpha, scomplex* x, int* incx, scomplex* tau );
int F77_zlarfgp( int* n, dcomplex* alpha, dcomplex* x, int* incx, dcomplex* tau );

// --- Form Q from QR factorization ---

// F77_?orgqr (real) / F77_?ungqr (complex) prototypes. a, tau and work share
// the variant's element type; work is accompanied by lwork.
int F77_sorgqr( int* m, int* n, int* k, float*    a, int* lda, float*    tau, float*    work, int* lwork, int* info );
int F77_dorgqr( int* m, int* n, int* k, double*   a, int* lda, double*   tau, double*   work, int* lwork, int* info );
int F77_cungqr( int* m, int* n, int* k, scomplex*   a, int* lda, scomplex*   tau, scomplex*   work, int* lwork, int* info );
int F77_zungqr( int* m, int* n, int* k, dcomplex*   a, int* lda, dcomplex*   tau, dcomplex*   work, int* lwork, int* info );

// --- Apply Q or Q' from QR factorization ---

// F77_?ormqr (real) / F77_?unmqr (complex) prototypes. a/tau carry the
// factorization data and c (with ldc) is the matrix operated on; work is
// accompanied by lwork.
int F77_sormqr( char* side, char* trans, int* m, int* n, int* k, float*    a, int* lda, float*    tau, float*    c, int* ldc, float*    work, int* lwork, int* info );
int F77_dormqr( char* side, char* trans, int* m, int* n, int* k, double*   a, int* lda, double*   tau, double*   c, int* ldc, double*   work, int* lwork, int* info );
int F77_cunmqr( char* side, char* trans, int* m, int* n, int* k, scomplex*   a, int* lda, scomplex*   tau, scomplex*   c, int* ldc, scomplex*   work, int* lwork, int* info );
int F77_zunmqr( char* side, char* trans, int* m, int* n, int* k, dcomplex*   a, int* lda, dcomplex*   tau, dcomplex*   c, int* ldc, dcomplex*   work, int* lwork, int* info );

// F77_?orm2r / F77_?unm2r prototypes: same argument list as the group
// directly above minus lwork.
int F77_sorm2r( char* side, char* trans, int* m, int* n, int* k, float*    a, int* lda, float*    tau, float*    c, int* ldc, float*    work, int* info );
int F77_dorm2r( char* side, char* trans, int* m, int* n, int* k, double*   a, int* lda, double*   tau, double*   c, int* ldc, double*   work, int* info );
int F77_cunm2r( char* side, char* trans, int* m, int* n, int* k, scomplex*   a, int* lda, scomplex*   tau, scomplex*   c, int* ldc, scomplex*   work, int* info );
int F77_zunm2r( char* side, char* trans, int* m, int* n, int* k, dcomplex*   a, int* lda, dcomplex*   tau, dcomplex*   c, int* ldc, dcomplex*   work, int* info );

// --- Form Q from LQ factorization ---

// F77_?orglq (real) / F77_?unglq (complex) prototypes. The argument list
// mirrors the QR counterpart: (m, n, k, a, lda, tau, work, lwork, info).
int F77_sorglq( int* m, int* n, int* k, float*    a, int* lda, float*    tau, float*    work, int* lwork, int* info );
int F77_dorglq( int* m, int* n, int* k, double*   a, int* lda, double*   tau, double*   work, int* lwork, int* info );
int F77_cunglq( int* m, int* n, int* k, scomplex* a, int* lda, scomplex* tau, scomplex* work, int* lwork, int* info );
int F77_zunglq( int* m, int* n, int* k, dcomplex* a, int* lda, dcomplex* tau, dcomplex* work, int* lwork, int* info );

// --- Apply Q or Q' from LQ factorization ---

// F77_?ormlq (real) / F77_?unmlq (complex) prototypes. a/tau carry the
// factorization data and c (with ldc) is the matrix operated on; work is
// accompanied by lwork.
int F77_sormlq( char* side, char* trans, int* m, int* n, int* k, float*    a, int* lda, float*    tau, float*    c, int* ldc, float*    work, int* lwork, int* info );
int F77_dormlq( char* side, char* trans, int* m, int* n, int* k, double*   a, int* lda, double*   tau, double*   c, int* ldc, double*   work, int* lwork, int* info );
int F77_cunmlq( char* side, char* trans, int* m, int* n, int* k, scomplex*   a, int* lda, scomplex*   tau, scomplex*   c, int* ldc, scomplex*   work, int* lwork, int* info );
int F77_zunmlq( char* side, char* trans, int* m, int* n, int* k, dcomplex*   a, int* lda, dcomplex*   tau, dcomplex*   c, int* ldc, dcomplex*   work, int* lwork, int* info );

// F77_?orml2 / F77_?unml2 prototypes: same argument list as the group
// directly above minus lwork.
int F77_sorml2( char* side, char* trans, int* m, int* n, int* k, float*    a, int* lda, float*    tau, float*    c, int* ldc, float*    work, int* info );
int F77_dorml2( char* side, char* trans, int* m, int* n, int* k, double*   a, int* lda, double*   tau, double*   c, int* ldc, double*   work, int* info );
int F77_cunml2( char* side, char* trans, int* m, int* n, int* k, scomplex*   a, int* lda, scomplex*   tau, scomplex*   c, int* ldc, scomplex*   work, int* info );
int F77_zunml2( char* side, char* trans, int* m, int* n, int* k, dcomplex*   a, int* lda, dcomplex*   tau, dcomplex*   c, int* ldc, dcomplex*   work, int* info );

// --- Form Q from tridiagonal reduction ---

// F77_?orgtr (real) / F77_?ungtr (complex) prototypes. A single dimension
// argument is passed (named m in these declarations), followed by a/lda, tau
// and the work/lwork workspace pair.
int F77_sorgtr( char* uplo, int* m, float*    a, int* lda, float*    tau, float*    work, int* lwork, int* info );
int F77_dorgtr( char* uplo, int* m, double*   a, int* lda, double*   tau, double*   work, int* lwork, int* info );
int F77_cungtr( char* uplo, int* m, scomplex* a, int* lda, scomplex* tau, scomplex* work, int* lwork, int* info );
int F77_zungtr( char* uplo, int* m, dcomplex* a, int* lda, dcomplex* tau, dcomplex* work, int* lwork, int* info );

// --- Apply Q or Q' from tridiagonal reduction ---

// F77_?ormtr (real) / F77_?unmtr (complex) prototypes. Three character flags
// (side, uplo, trans) precede the dimensions; a/tau carry the reduction data
// and c (with ldc) is the matrix operated on.
int F77_sormtr( char* side, char* uplo, char* trans, int* m, int* n, float*    a, int* lda, float*    tau, float*    c, int* ldc, float*    work, int* lwork, int* info );
int F77_dormtr( char* side, char* uplo, char* trans, int* m, int* n, double*   a, int* lda, double*   tau, double*   c, int* ldc, double*   work, int* lwork, int* info );
int F77_cunmtr( char* side, char* uplo, char* trans, int* m, int* n, scomplex* a, int* lda, scomplex* tau, scomplex* c, int* ldc, scomplex* work, int* lwork, int* info );
int F77_zunmtr( char* side, char* uplo, char* trans, int* m, int* n, dcomplex* a, int* lda, dcomplex* tau, dcomplex* c, int* ldc, dcomplex* work, int* lwork, int* info );

// --- Form Q from bidiagonal reduction ---

// F77_?orgbr (real) / F77_?ungbr (complex) prototypes. The leading vect flag
// is followed by (m, n, k), a/lda, tau and the work/lwork workspace pair.
int F77_sorgbr( char* vect, int* m, int* n, int* k, float*    a, int* lda, float*    tau, float*    work, int* lwork, int* info );
int F77_dorgbr( char* vect, int* m, int* n, int* k, double*   a, int* lda, double*   tau, double*   work, int* lwork, int* info );
int F77_cungbr( char* vect, int* m, int* n, int* k, scomplex* a, int* lda, scomplex* tau, scomplex* work, int* lwork, int* info );
int F77_zungbr( char* vect, int* m, int* n, int* k, dcomplex* a, int* lda, dcomplex* tau, dcomplex* work, int* lwork, int* info );

// --- Apply Q or Q' from bidiagonal reduction ---

// F77_?ormbr (real) / F77_?unmbr (complex) prototypes. Three character flags
// (vect, side, trans) precede (m, n, k); a/tau carry the reduction data and
// c (with ldc) is the matrix operated on.
int F77_sormbr( char* vect, char* side, char* trans, int* m, int* n, int* k, float*    a, int* lda, float*    tau, float*    c, int* ldc, float*    work, int* lwork, int* info );
int F77_dormbr( char* vect, char* side, char* trans, int* m, int* n, int* k, double*   a, int* lda, double*   tau, double*   c, int* ldc, double*   work, int* lwork, int* info );
int F77_cunmbr( char* vect, char* side, char* trans, int* m, int* n, int* k, scomplex* a, int* lda, scomplex* tau, scomplex* c, int* ldc, scomplex* work, int* lwork, int* info );
int F77_zunmbr( char* vect, char* side, char* trans, int* m, int* n, int* k, dcomplex* a, int* lda, dcomplex* tau, dcomplex* c, int* ldc, dcomplex* work, int* lwork, int* info );

// --- Tridiagonal QR algorithm ---

// F77_?steqr prototypes. d, e and work are real in all four variants; only z
// is complex in the 'c' and 'z' variants. There is no lwork argument.
int F77_ssteqr( char* jobz, int* n, float*    d, float*    e, float*    z, int* ldz, float*  work, int* info ); 
int F77_dsteqr( char* jobz, int* n, double*   d, double*   e, double*   z, int* ldz, double* work, int* info ); 
int F77_csteqr( char* jobz, int* n, float*    d, float*    e, scomplex* z, int* ldz, float*  work, int* info ); 
int F77_zsteqr( char* jobz, int* n, double*   d, double*   e, dcomplex* z, int* ldz, double* work, int* info ); 

// --- Tridiagonal divide-and-conquer algorithm ---

// F77_?stedc prototypes. The real variants take two workspaces (work/lwork
// and iwork/liwork); the complex variants insert a third, real-typed
// rwork/lrwork pair between them, and their work array is complex.
int F77_sstedc( char* compz, int* n, float*    d, float*    e, float*    z, int* ldz, float*    work, int* lwork,                             int* iwork, int* liwork, int* info );
int F77_dstedc( char* compz, int* n, double*   d, double*   e, double*   z, int* ldz, double*   work, int* lwork,                             int* iwork, int* liwork, int* info );
int F77_cstedc( char* compz, int* n, float*    d, float*    e, scomplex* z, int* ldz, scomplex* work, int* lwork, float*  rwork, int* lrwork, int* iwork, int* liwork, int* info );
int F77_zstedc( char* compz, int* n, double*   d, double*   e, dcomplex* z, int* ldz, dcomplex* work, int* lwork, double* rwork, int* lrwork, int* iwork, int* liwork, int* info );

// --- Tridiagonal MRRR algorithm ---

// F77_?stemr prototypes. d, e, w and work are real in all four variants; only
// z is complex in the 'c' and 'z' variants. Two workspaces are passed:
// work/lwork and iwork/liwork.
//
// NOTE(review): vl and vu are declared int* in all four variants, whereas the
// reference LAPACK ?STEMR interface documents VL and VU as real scalars
// (REAL / DOUBLE PRECISION). Confirm against the call sites before changing
// the declared types.
int F77_sstemr( char* jobz, char* range, int* n, float*  d, float*  e, int* vl, int* vu, int* il, int* iu, int* m, float*  w, float*    z, int* ldz, int* nzc, int* isuppz, int* tryrac, float*  work, int* lwork, int* iwork, int* liwork, int* info );
int F77_dstemr( char* jobz, char* range, int* n, double* d, double* e, int* vl, int* vu, int* il, int* iu, int* m, double* w, double*   z, int* ldz, int* nzc, int* isuppz, int* tryrac, double* work, int* lwork, int* iwork, int* liwork, int* info );
int F77_cstemr( char* jobz, char* range, int* n, float*  d, float*  e, int* vl, int* vu, int* il, int* iu, int* m, float*  w, scomplex* z, int* ldz, int* nzc, int* isuppz, int* tryrac, float*  work, int* lwork, int* iwork, int* liwork, int* info );
int F77_zstemr( char* jobz, char* range, int* n, double* d, double* e, int* vl, int* vu, int* il, int* iu, int* m, double* w, dcomplex* z, int* ldz, int* nzc, int* isuppz, int* tryrac, double* work, int* lwork, int* iwork, int* liwork, int* info );

// --- Hermitian eigenvalue decomposition (QR algorithm) ---

// F77_?syev (real) / F77_?heev (complex) prototypes. w is real in every
// variant; a and work use the variant's element type.
//
// NOTE(review): the real variants (ssyev/dsyev) are declared here with an
// rwork argument, but the reference LAPACK SSYEV/DSYEV interfaces have no
// RWORK parameter (only CHEEV/ZHEEV do). Confirm how the call sites invoke
// these before changing the declarations.
int F77_ssyev( char* jobz, char* uplo, int* n, float*    a, int* lda, float*  w, float*    work, int* lwork, float*  rwork, int* info ); 
int F77_dsyev( char* jobz, char* uplo, int* n, double*   a, int* lda, double* w, double*   work, int* lwork, double* rwork, int* info ); 
int F77_cheev( char* jobz, char* uplo, int* n, scomplex* a, int* lda, float*  w, scomplex* work, int* lwork, float*  rwork, int* info ); 
int F77_zheev( char* jobz, char* uplo, int* n, dcomplex* a, int* lda, double* w, dcomplex* work, int* lwork, double* rwork, int* info ); 

// --- Hermitian eigenvalue decomposition (divide-and-conquer) ---

// F77_?syevd (real) / F77_?heevd (complex) prototypes. The real variants take
// work/lwork and iwork/liwork; the complex variants additionally take a
// real-typed rwork/lrwork pair between them. w is real in every variant.
int F77_ssyevd( char* jobz, char* uplo, int* n, float*    a, int* lda, float*  w, float*    work, int* lwork,                             int* iwork, int* liwork, int* info ); 
int F77_dsyevd( char* jobz, char* uplo, int* n, double*   a, int* lda, double* w, double*   work, int* lwork,                             int* iwork, int* liwork, int* info ); 
int F77_cheevd( char* jobz, char* uplo, int* n, scomplex* a, int* lda, float*  w, scomplex* work, int* lwork, float*  rwork, int* lrwork, int* iwork, int* liwork, int* info ); 
int F77_zheevd( char* jobz, char* uplo, int* n, dcomplex* a, int* lda, double* w, dcomplex* work, int* lwork, double* rwork, int* lrwork, int* iwork, int* liwork, int* info ); 

// --- Hermitian eigenvalue decomposition (MRRR) ---

// F77_?syevr (real) / F77_?heevr (complex) prototypes. vl, vu, abstol and w
// are real in every variant; a, z and work use the variant's element type.
// The complex variants additionally take a real-typed rwork/lrwork pair
// between work/lwork and iwork/liwork.
int F77_ssyevr( char* jobz, char* range, char* uplo, int* n, float*    a, int* lda, float*  vl, float*  vu, int* il, int* iu, float*  abstol, int* m, float*  w, float*    z, int* ldz, int* isuppz, float*    work, int* lwork,                             int* iwork, int* liwork, int* info ); 
int F77_dsyevr( char* jobz, char* range, char* uplo, int* n, double*   a, int* lda, double* vl, double* vu, int* il, int* iu, double* abstol, int* m, double* w, double*   z, int* ldz, int* isuppz, double*   work, int* lwork,                             int* iwork, int* liwork, int* info ); 
int F77_cheevr( char* jobz, char* range, char* uplo, int* n, scomplex* a, int* lda, float*  vl, float*  vu, int* il, int* iu, float*  abstol, int* m, float*  w, scomplex* z, int* ldz, int* isuppz, scomplex* work, int* lwork, float*  rwork, int* lrwork, int* iwork, int* liwork, int* info ); 
int F77_zheevr( char* jobz, char* range, char* uplo, int* n, dcomplex* a, int* lda, double* vl, double* vu, int* il, int* iu, double* abstol, int* m, double* w, dcomplex* z, int* ldz, int* isuppz, dcomplex* work, int* lwork, double* rwork, int* lrwork, int* iwork, int* liwork, int* info ); 

// --- Bidiagonal QR algorithm ---

// F77_?bdsqr prototypes. d, e and the workspace (named rwork in all four
// declarations) are real in every variant; vt, u and c are complex in the
// 'c' and 'z' variants. ncvt, nru and ncc are integer counts that precede
// the arrays.
int F77_sbdsqr( char* uplo, int* n, int* ncvt, int* nru, int* ncc, float*    d, float*    e, float*    vt, int* ldvt, float*    u, int* ldu, float*    c, int* ldc, float*  rwork, int* info ); 
int F77_dbdsqr( char* uplo, int* n, int* ncvt, int* nru, int* ncc, double*   d, double*   e, double*   vt, int* ldvt, double*   u, int* ldu, double*   c, int* ldc, double* rwork, int* info ); 
int F77_cbdsqr( char* uplo, int* n, int* ncvt, int* nru, int* ncc, float*    d, float*    e, scomplex* vt, int* ldvt, scomplex* u, int* ldu, scomplex* c, int* ldc, float*  rwork, int* info ); 
int F77_zbdsqr( char* uplo, int* n, int* ncvt, int* nru, int* ncc, double*   d, double*   e, dcomplex* vt, int* ldvt, dcomplex* u, int* ldu, dcomplex* c, int* ldc, double* rwork, int* info ); 

// --- Bidiagonal divide-and-conquer algorithm ---

// F77_?bdsdc prototypes. Only real (float/double) variants are declared.
//
// NOTE(review): iq is declared float* / double* here, whereas the reference
// LAPACK ?BDSDC interface declares IQ as an INTEGER array. Confirm against
// the call sites before changing the declared type.
int F77_sbdsdc( char* uplo, char* compq, int* n, float*  d, float*  e, float*  u, int* ldu, float*  vt, int* ldvt, float*  q, float*  iq, float*  work, int* iwork, int* info ); 
int F77_dbdsdc( char* uplo, char* compq, int* n, double* d, double* e, double* u, int* ldu, double* vt, int* ldvt, double* q, double* iq, double* work, int* iwork, int* info ); 

// --- General matrix singular value decomposition (QR algorithm) ---

// F77_?gesvd prototypes. s is real in every variant; a, u, vt and work use
// the variant's element type. Only the complex variants take the extra
// real-typed rwork argument.
int F77_sgesvd( char* jobu, char* jobv, int* m, int* n, float*    a, int* lda, float*  s, float*    u, int* ldu, float*    vt, int* ldvt, float*    work, int* lwork,                int* info );
int F77_dgesvd( char* jobu, char* jobv, int* m, int* n, double*   a, int* lda, double* s, double*   u, int* ldu, double*   vt, int* ldvt, double*   work, int* lwork,                int* info );
int F77_cgesvd( char* jobu, char* jobv, int* m, int* n, scomplex* a, int* lda, float*  s, scomplex* u, int* ldu, scomplex* vt, int* ldvt, scomplex* work, int* lwork, float*  rwork, int* info );
int F77_zgesvd( char* jobu, char* jobv, int* m, int* n, dcomplex* a, int* lda, double* s, dcomplex* u, int* ldu, dcomplex* vt, int* ldvt, dcomplex* work, int* lwork, double* rwork, int* info );

// --- General matrix singular value decomposition (divide-and-conquer) ---

// F77_?gesdd prototypes. A single jobz flag replaces the two job flags of
// F77_?gesvd, and every variant takes an integer iwork array (with no
// accompanying length argument). Only the complex variants take the extra
// real-typed rwork argument.
int F77_sgesdd( char* jobz, int* m, int* n, float*    a, int* lda, float*  s, float*    u, int* ldu, float*    vt, int* ldvt, float*    work, int* lwork,                int* iwork, int* info );
int F77_dgesdd( char* jobz, int* m, int* n, double*   a, int* lda, double* s, double*   u, int* ldu, double*   vt, int* ldvt, double*   work, int* lwork,                int* iwork, int* info );
int F77_cgesdd( char* jobz, int* m, int* n, scomplex* a, int* lda, float*  s, scomplex* u, int* ldu, scomplex* vt, int* ldvt, scomplex* work, int* lwork, float*  rwork, int* iwork, int* info );
int F77_zgesdd( char* jobz, int* m, int* n, dcomplex* a, int* lda, double* s, dcomplex* u, int* ldu, dcomplex* vt, int* ldvt, dcomplex* work, int* lwork, double* rwork, int* iwork, int* info );

// --- Swap rows ---

// F77_?laswp prototypes. Only the matrix a changes type between variants;
// k1, k2, ipiv and incx are integer in all four. No info argument.
int F77_slaswp( int* n, float*    a, int* lda, int* k1, int* k2, int* ipiv, int* incx );
int F77_dlaswp( int* n, double*   a, int* lda, int* k1, int* k2, int* ipiv, int* incx );
int F77_claswp( int* n, scomplex* a, int* lda, int* k1, int* k2, int* ipiv, int* incx );
int F77_zlaswp( int* n, dcomplex* a, int* lda, int* k1, int* k2, int* ipiv, int* incx );

// --- Initialize a matrix ---

// F77_?laset prototypes. The two scalars alpha and beta are passed by pointer
// and share the element type of the matrix a. No info argument.
int F77_slaset( char* uplo, int* m, int* n, float*    alpha, float*    beta, float*    a, int* lda );
int F77_dlaset( char* uplo, int* m, int* n, double*   alpha, double*   beta, double*   a, int* lda );
int F77_claset( char* uplo, int* m, int* n, scomplex* alpha, scomplex* beta, scomplex* a, int* lda );
int F77_zlaset( char* uplo, int* m, int* n, dcomplex* alpha, dcomplex* beta, dcomplex* a, int* lda );

// end FLA_lapack_f77_prototypes.h

  // Include prototypes for LAPACK routines.
  //#include "FLA_lapack_f77_macro_defs.h"

  // Include prototypes for FLASH get/sets.
// begin FLASH_get_set_controls.h


#ifndef FLASH_GET_SET_CONTROLS_H
#define FLASH_GET_SET_CONTROLS_H

// Setter/getter pairs for four FLASH controls: preferred blocksize, preferred
// thread count, depth, and tile offload. Each setter returns an FLA_Error;
// each getter takes no arguments and returns the same type its setter accepts
// (dim_t or unsigned int).
// The declarations are visible only when FLA_ENABLE_LAPACK2FLASH is defined.
#ifdef FLA_ENABLE_LAPACK2FLASH // Start lapack2flash

FLA_Error      FLASH_set_preferred_blocksize( dim_t blocksize );
dim_t          FLASH_get_preferred_blocksize( void );
FLA_Error      FLASH_set_n_preferred_threads( unsigned int threads );
unsigned int   FLASH_get_n_preferred_threads( void );
FLA_Error      FLASH_set_depth( dim_t depth );
dim_t          FLASH_get_depth( void );
FLA_Error      FLASH_set_tile_offload( unsigned int tiles );
unsigned int   FLASH_get_tile_offload( void );

#endif // End lapack2flash

#endif // End header
// end FLASH_get_set_controls.h

// End extern "C" construct block.
#ifdef __cplusplus
}
#endif 

#endif

